fix: render Telegram media captions

2026-01-24 03:39:21 +00:00
parent d57cb2e1a8
commit de2d986008
10 changed files with 176 additions and 80 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -34,6 +34,7 @@ Docs: https://docs.clawd.bot
 - CLI: explain when auth profiles are excluded by auth.order in probe details.
 - CLI: drop the em dash when the banner tagline wraps to a second line.
 - CLI: inline auth probe errors in status rows to reduce wrapping.
 - Telegram: render markdown in media captions. (#1478)
 - Agents: honor enqueue overrides for embedded runs to avoid queue deadlocks in tests.
 - Daemon: use platform PATH delimiters when building minimal service paths.
 - Tests: skip embedded runner ordering assertion on Windows to avoid CI timeouts.
--- a/src/telegram/bot.create-telegram-bot.routes-dms-by-telegram-accountid-binding.test.ts
+++ b/src/telegram/bot.create-telegram-bot.routes-dms-by-telegram-accountid-binding.test.ts
@@ -363,6 +363,7 @@ describe("createTelegramBot", () => {
    expect(sendAnimationSpy).toHaveBeenCalledTimes(1);
    expect(sendAnimationSpy).toHaveBeenCalledWith("1234", expect.anything(), {
      caption: "caption",
      parse_mode: "HTML",
      reply_to_message_id: undefined,
    });
    expect(sendPhotoSpy).not.toHaveBeenCalled();
--- a/src/telegram/bot.test.ts
+++ b/src/telegram/bot.test.ts
@@ -1392,6 +1392,7 @@ describe("createTelegramBot", () => {
    expect(sendAnimationSpy).toHaveBeenCalledTimes(1);
    expect(sendAnimationSpy).toHaveBeenCalledWith("1234", expect.anything(), {
      caption: "caption",
      parse_mode: "HTML",
      reply_to_message_id: undefined,
    });
    expect(sendPhotoSpy).not.toHaveBeenCalled();
--- a/src/telegram/bot/delivery.test.ts
+++ b/src/telegram/bot/delivery.test.ts
@@ -74,4 +74,38 @@ describe("deliverReplies", () => {
    expect(sendVoice).toHaveBeenCalledTimes(1);
    expect(events).toEqual(["recordVoice", "sendVoice"]);
  });
  it("renders markdown in media captions", async () => {
    // A photo reply whose text carries markdown bold: the caption handed to
    // sendPhoto must be the HTML rendering, sent with parse_mode "HTML".
    const sendPhoto = vi.fn().mockResolvedValue({ message_id: 2, chat: { id: "123" } });
    const bot = { api: { sendPhoto } } as unknown as Bot;
    const runtime = { error: vi.fn(), log: vi.fn() };

    loadWebMedia.mockResolvedValueOnce({
      buffer: Buffer.from("image"),
      contentType: "image/jpeg",
      fileName: "photo.jpg",
    });

    const markdownReply = { mediaUrl: "https://example.com/photo.jpg", text: "hi **boss**" };
    await deliverReplies({
      replies: [markdownReply],
      chatId: "123",
      token: "tok",
      runtime,
      bot,
      replyToMode: "off",
      textLimit: 4000,
    });

    // Only caption and parse_mode are pinned; other media params may vary.
    const expectedParams = expect.objectContaining({
      caption: "hi <b>boss</b>",
      parse_mode: "HTML",
    });
    expect(sendPhoto).toHaveBeenCalledWith("123", expect.anything(), expectedParams);
  });
 });
--- a/src/telegram/bot/delivery.ts
+++ b/src/telegram/bot/delivery.ts
@@ -1,5 +1,9 @@
 import { type Bot, InputFile } from "grammy";
-import { markdownToTelegramChunks, markdownToTelegramHtml } from "../format.js";
+import {
  markdownToTelegramChunks,
  markdownToTelegramHtml,
  renderTelegramHtmlText,
 } from "../format.js";
 import { splitTelegramCaption } from "../caption.js";
 import type { ReplyPayload } from "../../auto-reply/types.js";
 import type { ReplyToMode } from "../../config/config.js";
@@ -87,6 +91,9 @@ export async function deliverReplies(params: {
      const { caption, followUpText } = splitTelegramCaption(
        isFirstMedia ? (reply.text ?? undefined) : undefined,
      );
      const htmlCaption = caption
        ? renderTelegramHtmlText(caption, { tableMode: params.tableMode })
        : undefined;
      if (followUpText) {
        pendingFollowUpText = followUpText;
      }
@@ -94,8 +101,9 @@ export async function deliverReplies(params: {
      const replyToMessageId =
        replyToId && (replyToMode === "all" || !hasReplied) ? replyToId : undefined;
      const mediaParams: Record<string, unknown> = {
-        caption,
+        caption: htmlCaption,
        reply_to_message_id: replyToMessageId,
        ...(htmlCaption ? { parse_mode: "HTML" } : {}),
      };
      if (threadParams) {
        mediaParams.message_thread_id = threadParams.message_thread_id;
@@ -149,14 +157,12 @@ export async function deliverReplies(params: {
        for (const chunk of chunks) {
          const replyToMessageIdFollowup =
            replyToId && (replyToMode === "all" || !hasReplied) ? replyToId : undefined;
-          await bot.api.sendMessage(
+          await sendTelegramText(bot, chatId, chunk.html, runtime, {
-            chatId,
+            replyToMessageId: replyToMessageIdFollowup,
-            chunk.text,
+            messageThreadId,
-            buildTelegramSendParams({
+            textMode: "html",
-              replyToMessageId: replyToMessageIdFollowup,
+            plainText: chunk.text,
-              messageThreadId,
+          });
            }),
          );
          if (replyToId && !hasReplied) {
            hasReplied = true;
          }
--- a/src/telegram/format.ts
+++ b/src/telegram/format.ts
@@ -60,6 +60,15 @@ export function markdownToTelegramHtml(
  return renderTelegramHtml(ir);
 }
 /**
  * Produce the HTML string to send to Telegram for `text`.
  *
  * @param text - Message or caption body.
  * @param options - `textMode` says how `text` is already encoded and defaults to
  *   "markdown" when omitted; `tableMode` is forwarded to `markdownToTelegramHtml`.
  * @returns `text` unchanged when `textMode` is "html"; otherwise the result of
  *   `markdownToTelegramHtml(text, { tableMode })`.
  */
 export function renderTelegramHtmlText(
  text: string,
  options: { textMode?: "markdown" | "html"; tableMode?: MarkdownTableMode } = {},
 ): string {
  const alreadyHtml = (options.textMode ?? "markdown") === "html";
  return alreadyHtml ? text : markdownToTelegramHtml(text, { tableMode: options.tableMode });
 }
 export function markdownToTelegramChunks(
  markdown: string,
  limit: number,
--- a/src/telegram/send.caption-split.test.ts
+++ b/src/telegram/send.caption-split.test.ts
@@ -87,8 +87,10 @@ describe("sendMessageTelegram caption splitting", () => {
    expect(sendPhoto).toHaveBeenCalledWith(chatId, expect.anything(), {
      caption: undefined,
    });
-    // Then text sent as separate message (plain text, matching caption behavior)
+    // Then text sent as separate message (HTML formatting)
-    expect(sendMessage).toHaveBeenCalledWith(chatId, longText);
+    expect(sendMessage).toHaveBeenCalledWith(chatId, longText, {
      parse_mode: "HTML",
    });
    // Returns the text message ID (the "main" content)
    expect(res.messageId).toBe("71");
  });
@@ -123,12 +125,43 @@ describe("sendMessageTelegram caption splitting", () => {
    // Caption should be included with media
    expect(sendPhoto).toHaveBeenCalledWith(chatId, expect.anything(), {
      caption: shortText,
      parse_mode: "HTML",
    });
    // No separate text message needed
    expect(sendMessage).not.toHaveBeenCalled();
    expect(res.messageId).toBe("72");
  });
  it("renders markdown in media captions", async () => {
    // Short markdown text sent alongside a photo: it fits in the caption, so it
    // must arrive at sendPhoto already converted to HTML with parse_mode set.
    const chatId = "123";
    const sendPhoto = vi.fn().mockResolvedValue({ message_id: 90, chat: { id: chatId } });
    const api = { sendPhoto } as unknown as {
      sendPhoto: typeof sendPhoto;
    };

    loadWebMedia.mockResolvedValueOnce({
      buffer: Buffer.from("fake-image"),
      contentType: "image/jpeg",
      fileName: "photo.jpg",
    });

    const markdownCaption = "hi **boss**";
    const sendOptions = {
      token: "tok",
      api,
      mediaUrl: "https://example.com/photo.jpg",
    };
    await sendMessageTelegram(chatId, markdownCaption, sendOptions);

    expect(sendPhoto).toHaveBeenCalledWith(chatId, expect.anything(), {
      caption: "hi <b>boss</b>",
      parse_mode: "HTML",
    });
  });
  it("preserves thread params when splitting long captions", async () => {
    const chatId = "-1001234567890";
    const longText = "C".repeat(1100);
@@ -166,8 +199,9 @@ describe("sendMessageTelegram caption splitting", () => {
      message_thread_id: 271,
      reply_to_message_id: 500,
    });
-    // Text message also includes thread params (plain text, matching caption behavior)
+    // Text message also includes thread params (HTML formatting)
    expect(sendMessage).toHaveBeenCalledWith(chatId, longText, {
      parse_mode: "HTML",
      message_thread_id: 271,
      reply_to_message_id: 500,
    });
@@ -209,6 +243,7 @@ describe("sendMessageTelegram caption splitting", () => {
    });
    // Follow-up text has the reply_markup
    expect(sendMessage).toHaveBeenCalledWith(chatId, longText, {
      parse_mode: "HTML",
      reply_markup: {
        inline_keyboard: [[{ text: "Click me", callback_data: "action:click" }]],
      },
@@ -253,6 +288,7 @@ describe("sendMessageTelegram caption splitting", () => {
      reply_to_message_id: 500,
    });
    expect(sendMessage).toHaveBeenCalledWith(chatId, longText, {
      parse_mode: "HTML",
      message_thread_id: 271,
      reply_to_message_id: 500,
      reply_markup: {
@@ -353,6 +389,7 @@ describe("sendMessageTelegram caption splitting", () => {
    // Media sent WITH reply_markup when not splitting
    expect(sendPhoto).toHaveBeenCalledWith(chatId, expect.anything(), {
      caption: shortText,
      parse_mode: "HTML",
      reply_markup: {
        inline_keyboard: [[{ text: "Click me", callback_data: "action:click" }]],
      },
--- a/src/telegram/send.preserves-thread-params-plain-text-fallback.test.ts
+++ b/src/telegram/send.preserves-thread-params-plain-text-fallback.test.ts
@@ -94,6 +94,7 @@ describe("buildInlineKeyboard", () => {
    expect(sendPhoto).toHaveBeenCalledWith(chatId, expect.anything(), {
      caption: "photo in topic",
      parse_mode: "HTML",
      message_thread_id: 99,
    });
  });
--- a/src/telegram/send.returns-undefined-empty-input.test.ts
+++ b/src/telegram/send.returns-undefined-empty-input.test.ts
@@ -285,6 +285,7 @@ describe("sendMessageTelegram", () => {
    expect(sendAnimation).toHaveBeenCalledTimes(1);
    expect(sendAnimation).toHaveBeenCalledWith(chatId, expect.anything(), {
      caption: "caption",
      parse_mode: "HTML",
    });
    expect(res.messageId).toBe("9");
  });
@@ -318,6 +319,7 @@ describe("sendMessageTelegram", () => {
    expect(sendAudio).toHaveBeenCalledWith(chatId, expect.anything(), {
      caption: "caption",
      parse_mode: "HTML",
    });
    expect(sendVoice).not.toHaveBeenCalled();
  });
@@ -354,6 +356,7 @@ describe("sendMessageTelegram", () => {
    expect(sendVoice).toHaveBeenCalledWith(chatId, expect.anything(), {
      caption: "voice note",
      parse_mode: "HTML",
      message_thread_id: 271,
      reply_to_message_id: 500,
    });
@@ -390,6 +393,7 @@ describe("sendMessageTelegram", () => {
    expect(sendAudio).toHaveBeenCalledWith(chatId, expect.anything(), {
      caption: "caption",
      parse_mode: "HTML",
    });
    expect(sendVoice).not.toHaveBeenCalled();
  });
--- a/src/telegram/send.ts
+++ b/src/telegram/send.ts
@@ -16,7 +16,7 @@ import { isGifMedia } from "../media/mime.js";
 import { loadWebMedia } from "../web/media.js";
 import { resolveTelegramAccount } from "./accounts.js";
 import { resolveTelegramFetch } from "./fetch.js";
-import { markdownToTelegramHtml } from "./format.js";
+import { renderTelegramHtmlText } from "./format.js";
 import { resolveMarkdownTableMode } from "../config/markdown-tables.js";
 import { splitTelegramCaption } from "./caption.js";
 import { recordSentMessage } from "./sent-message-cache.js";
@@ -190,6 +190,55 @@ export async function sendMessageTelegram(
    );
  };
  const textMode = opts.textMode ?? "markdown";
  const tableMode = resolveMarkdownTableMode({
    cfg,
    channel: "telegram",
    accountId: account.accountId,
  });
  const renderHtmlText = (value: string) => renderTelegramHtmlText(value, { textMode, tableMode });
  // Sends `rawText` to `chatId` as an HTML message (rendered via `renderHtmlText`).
  // If the send fails with an error matching PARSE_ERR_RE, it retries once without
  // parse_mode, sending `fallbackText` when provided and `rawText` otherwise.
  // Any other failure, and a failed retry, is rethrown through `wrapChatNotFound`.
  const sendTelegramText = async (
    rawText: string,
    params?: Record<string, unknown>,
    fallbackText?: string,
  ) => {
    const htmlText = renderHtmlText(rawText);
    // Spreading an undefined `params` contributes no keys, so one literal covers
    // both the "extra params" and "no params" cases.
    const htmlParams = { parse_mode: "HTML" as const, ...params };

    return await request(() => api.sendMessage(chatId, htmlText, htmlParams), "message").catch(
      async (err) => {
        const errText = formatErrorMessage(err);
        if (!PARSE_ERR_RE.test(errText)) {
          throw wrapChatNotFound(err);
        }
        // Telegram rejects malformed HTML (e.g., unsupported tags or entities).
        // When that happens, fall back to plain text so the message still delivers.
        if (opts.verbose) {
          console.warn(`telegram HTML parse failed, retrying as plain text: ${errText}`);
        }
        const plainText = fallbackText ?? rawText;
        // Pass a params object only when there is something in it; otherwise call
        // sendMessage with just the chat id and text.
        const hasExtraParams = params !== undefined && Object.keys(params).length > 0;
        const retryParams = params && hasExtraParams ? { ...params } : undefined;
        const sendPlain = () =>
          retryParams
            ? api.sendMessage(chatId, plainText, retryParams)
            : api.sendMessage(chatId, plainText);
        return await request(sendPlain, "message-plain").catch((retryErr) => {
          throw wrapChatNotFound(retryErr);
        });
      },
    );
  };
  if (mediaUrl) {
    const media = await loadWebMedia(mediaUrl, opts.maxBytes);
    const kind = mediaKindFromMime(media.contentType ?? undefined);
@@ -200,21 +249,21 @@ export async function sendMessageTelegram(
    const fileName = media.fileName ?? (isGif ? "animation.gif" : inferFilename(kind)) ?? "file";
    const file = new InputFile(media.buffer, fileName);
    const { caption, followUpText } = splitTelegramCaption(text);
    const htmlCaption = caption ? renderHtmlText(caption) : undefined;
    // If text exceeds Telegram's caption limit, send media without caption
    // then send text as a separate follow-up message.
    const needsSeparateText = Boolean(followUpText);
    // When splitting, put reply_markup only on the follow-up text (the "main" content),
    // not on the media message.
-    const mediaParams = hasThreadParams
+    const baseMediaParams = {
-      ? {
+      ...(hasThreadParams ? threadParams : {}),
-          caption,
+      ...(!needsSeparateText && replyMarkup ? { reply_markup: replyMarkup } : {}),
-          ...threadParams,
+    };
-          ...(!needsSeparateText && replyMarkup ? { reply_markup: replyMarkup } : {}),
+    const mediaParams = {
-        }
+      caption: htmlCaption,
-      : {
+      ...(htmlCaption ? { parse_mode: "HTML" as const } : {}),
-          caption,
+      ...baseMediaParams,
-          ...(!needsSeparateText && replyMarkup ? { reply_markup: replyMarkup } : {}),
+    };
        };
    let result:
      | Awaited<ReturnType<typeof api.sendPhoto>>
      | Awaited<ReturnType<typeof api.sendVideo>>
@@ -279,7 +328,7 @@ export async function sendMessageTelegram(
    });
    // If text was too long for a caption, send it as a separate follow-up message.
-    // Use plain text to match caption behavior (captions don't use HTML conversion).
+    // Use HTML conversion so markdown renders like captions.
    if (needsSeparateText && followUpText) {
      const textParams =
        hasThreadParams || replyMarkup
@@ -288,15 +337,7 @@ export async function sendMessageTelegram(
              ...(replyMarkup ? { reply_markup: replyMarkup } : {}),
            }
          : undefined;
-      const textRes = await request(
+      const textRes = await sendTelegramText(followUpText, textParams);
        () =>
          textParams
            ? api.sendMessage(chatId, followUpText, textParams)
            : api.sendMessage(chatId, followUpText),
        "message",
      ).catch((err) => {
        throw wrapChatNotFound(err);
      });
      // Return the text message ID as the "main" message (it's the actual content).
      return {
        messageId: String(textRes?.message_id ?? mediaMessageId),
@@ -310,53 +351,14 @@ export async function sendMessageTelegram(
  if (!text || !text.trim()) {
    throw new Error("Message must be non-empty for Telegram sends");
  }
-  const textMode = opts.textMode ?? "markdown";
+  const textParams =
-  const tableMode = resolveMarkdownTableMode({
+    hasThreadParams || replyMarkup
-    cfg,
+      ? {
-    channel: "telegram",
+          ...threadParams,
-    accountId: account.accountId,
+          ...(replyMarkup ? { reply_markup: replyMarkup } : {}),
  });
  const htmlText = textMode === "html" ? text : markdownToTelegramHtml(text, { tableMode });
  const textParams = hasThreadParams
    ? {
        parse_mode: "HTML" as const,
        ...threadParams,
        ...(replyMarkup ? { reply_markup: replyMarkup } : {}),
      }
    : {
        parse_mode: "HTML" as const,
        ...(replyMarkup ? { reply_markup: replyMarkup } : {}),
      };
  const res = await request(() => api.sendMessage(chatId, htmlText, textParams), "message").catch(
    async (err) => {
      // Telegram rejects malformed HTML (e.g., unsupported tags or entities).
      // When that happens, fall back to plain text so the message still delivers.
      const errText = formatErrorMessage(err);
      if (PARSE_ERR_RE.test(errText)) {
        if (opts.verbose) {
          console.warn(`telegram HTML parse failed, retrying as plain text: ${errText}`);
        }
-        const plainParams =
+      : undefined;
-          hasThreadParams || replyMarkup
+  const res = await sendTelegramText(text, textParams, opts.plainText);
            ? {
                ...threadParams,
                ...(replyMarkup ? { reply_markup: replyMarkup } : {}),
              }
            : undefined;
        const fallbackText = opts.plainText ?? text;
        return await request(
          () =>
            plainParams
              ? api.sendMessage(chatId, fallbackText, plainParams)
              : api.sendMessage(chatId, fallbackText),
          "message-plain",
        ).catch((err2) => {
          throw wrapChatNotFound(err2);
        });
      }
      throw wrapChatNotFound(err);
    },
  );
  const messageId = String(res?.message_id ?? "unknown");
  if (res?.message_id) {
    recordSentMessage(chatId, res.message_id);