feat(telegram): support media groups (multi-image messages) (#220)

This commit is contained in:
Ayaan Zaidi
2026-01-06 09:34:33 +05:30
committed by GitHub
parent fb2513e265
commit bd735182b6
3 changed files with 437 additions and 205 deletions

View File

@@ -10,6 +10,9 @@ export type MsgContext = {
MediaPath?: string; MediaPath?: string;
MediaUrl?: string; MediaUrl?: string;
MediaType?: string; MediaType?: string;
MediaPaths?: string[];
MediaUrls?: string[];
MediaTypes?: string[];
Transcript?: string; Transcript?: string;
ChatType?: string; ChatType?: string;
GroupSubject?: string; GroupSubject?: string;

View File

@@ -209,3 +209,135 @@ describe("telegram inbound media", () => {
fetchSpy.mockRestore(); fetchSpy.mockRestore();
}); });
}); });
describe("telegram media groups", () => {
  // The bot flushes a buffered album from a 500ms timer (MEDIA_GROUP_TIMEOUT_MS
  // in the bot module), so wait slightly longer than that before asserting.
  const waitForMediaGroupProcessing = () =>
    new Promise((resolve) => setTimeout(resolve, 600));

  /** Replaces global fetch with a stub that answers every request with a tiny PNG body. */
  const stubImageFetch = () =>
    vi.spyOn(globalThis, "fetch" as never).mockResolvedValue({
      ok: true,
      status: 200,
      statusText: "OK",
      headers: { get: () => "image/png" },
      arrayBuffer: async () => new Uint8Array([0x89, 0x50, 0x4e, 0x47]).buffer,
    } as Response);

  /** Imports the bot factory and the reply spy, resetting the shared spies first. */
  const loadHarness = async () => {
    const { createTelegramBot } = await import("./bot.js");
    const replyModule = await import("../auto-reply/reply.js");
    const replySpy = replyModule.__replySpy as unknown as ReturnType<
      typeof vi.fn
    >;
    onSpy.mockReset();
    replySpy.mockReset();
    return { createTelegramBot, replySpy };
  };

  /** Returns the callback passed to the first `on(...)` registration recorded by onSpy. */
  const firstRegisteredHandler = () =>
    onSpy.mock.calls[0][1] as (
      ctx: Record<string, unknown>,
    ) => Promise<void>;

  /** Builds a private-chat update (chat id 42) whose getFile resolves to `filePath`. */
  const privateChatUpdate = (
    fields: Record<string, unknown>,
    filePath: string,
  ) => ({
    message: { chat: { id: 42, type: "private" }, ...fields },
    me: { username: "clawdbot_bot" },
    getFile: async () => ({ file_path: filePath }),
  });

  it("buffers messages with same media_group_id and processes them together", async () => {
    const { createTelegramBot, replySpy } = await loadHarness();
    const runtimeError = vi.fn();
    const fetchSpy = stubImageFetch();
    // Restore fetch in `finally` so a failed assertion cannot leak the stub
    // into tests that run afterwards.
    try {
      createTelegramBot({
        token: "tok",
        runtime: {
          log: vi.fn(),
          error: runtimeError,
          exit: () => {
            throw new Error("exit");
          },
        },
      });
      const handler = firstRegisteredHandler();

      await handler(
        privateChatUpdate(
          {
            message_id: 1,
            caption: "Here are my photos",
            date: 1736380800,
            media_group_id: "album123",
            photo: [{ file_id: "photo1" }],
          },
          "photos/photo1.jpg",
        ),
      );
      await handler(
        privateChatUpdate(
          {
            message_id: 2,
            date: 1736380801,
            media_group_id: "album123",
            photo: [{ file_id: "photo2" }],
          },
          "photos/photo2.jpg",
        ),
      );

      // Nothing is dispatched until the buffering delay has elapsed.
      expect(replySpy).not.toHaveBeenCalled();
      await waitForMediaGroupProcessing();

      expect(runtimeError).not.toHaveBeenCalled();
      expect(replySpy).toHaveBeenCalledTimes(1);
      const payload = replySpy.mock.calls[0][0];
      expect(payload.Body).toContain("Here are my photos");
      expect(payload.MediaPaths).toHaveLength(2);
    } finally {
      fetchSpy.mockRestore();
    }
  }, 2000);

  it("processes separate media groups independently", async () => {
    const { createTelegramBot, replySpy } = await loadHarness();
    const fetchSpy = stubImageFetch();
    // Restore fetch in `finally` so a failed assertion cannot leak the stub
    // into tests that run afterwards.
    try {
      createTelegramBot({ token: "tok" });
      const handler = firstRegisteredHandler();

      await handler(
        privateChatUpdate(
          {
            message_id: 1,
            caption: "Album A",
            date: 1736380800,
            media_group_id: "albumA",
            photo: [{ file_id: "photoA1" }],
          },
          "photos/photoA1.jpg",
        ),
      );
      await handler(
        privateChatUpdate(
          {
            message_id: 2,
            caption: "Album B",
            date: 1736380801,
            media_group_id: "albumB",
            photo: [{ file_id: "photoB1" }],
          },
          "photos/photoB1.jpg",
        ),
      );

      expect(replySpy).not.toHaveBeenCalled();
      await waitForMediaGroupProcessing();

      // Two distinct media_group_id values must yield two separate dispatches.
      expect(replySpy).toHaveBeenCalledTimes(2);
    } finally {
      fetchSpy.mockRestore();
    }
  }, 2000);
});

View File

@@ -34,8 +34,20 @@ import { loadWebMedia } from "../web/media.js";
const PARSE_ERR_RE = const PARSE_ERR_RE =
/can't parse entities|parse entities|find end of the entity/i; /can't parse entities|parse entities|find end of the entity/i;
// Media group aggregation: Telegram delivers a multi-image message as separate
// updates that share one media_group_id. Those updates are buffered and handled
// as a single message once this many milliseconds pass without another update
// for the same group (the timer is restarted on every new update).
const MEDIA_GROUP_TIMEOUT_MS = 500;
type TelegramMessage = Message.CommonMessage; type TelegramMessage = Message.CommonMessage;
/** Buffered state for one in-flight Telegram media group (album). */
type MediaGroupEntry = {
  /** Every update received so far for the group, paired with its handler context. */
  messages: { msg: TelegramMessage; ctx: TelegramContext }[];
  /** Handle of the pending flush timer; cleared and replaced when another update arrives. */
  timer: ReturnType<typeof setTimeout>;
};
type TelegramContext = { type TelegramContext = {
message: TelegramMessage; message: TelegramMessage;
me?: { username?: string }; me?: { username?: string };
@@ -69,6 +81,8 @@ export function createTelegramBot(opts: TelegramBotOptions) {
const bot = new Bot(opts.token, { client }); const bot = new Bot(opts.token, { client });
bot.api.config.use(apiThrottler()); bot.api.config.use(apiThrottler());
const mediaGroupBuffer = new Map<string, MediaGroupEntry>();
const cfg = loadConfig(); const cfg = loadConfig();
const textLimit = resolveTextChunkLimit(cfg, "telegram"); const textLimit = resolveTextChunkLimit(cfg, "telegram");
const allowFrom = opts.allowFrom ?? cfg.telegram?.allowFrom; const allowFrom = opts.allowFrom ?? cfg.telegram?.allowFrom;
@@ -94,24 +108,13 @@ export function createTelegramBot(opts: TelegramBotOptions) {
overrideOrder: "after-config", overrideOrder: "after-config",
}); });
bot.on("message", async (ctx) => { const processMessage = async (
try { primaryCtx: TelegramContext,
const msg = ctx.message; allMedia: Array<{ path: string; contentType?: string }>,
if (!msg) return; ) => {
const msg = primaryCtx.message;
const chatId = msg.chat.id; const chatId = msg.chat.id;
const isGroup = const isGroup = msg.chat.type === "group" || msg.chat.type === "supergroup";
msg.chat.type === "group" || msg.chat.type === "supergroup";
if (isGroup) {
const groupPolicy = resolveGroupPolicy(chatId);
if (groupPolicy.allowlistEnabled && !groupPolicy.allowed) {
logger.info(
{ chatId, title: msg.chat.title, reason: "not-allowed" },
"skipping group message",
);
return;
}
}
const sendTyping = async () => { const sendTyping = async () => {
try { try {
@@ -140,7 +143,7 @@ export function createTelegramBot(opts: TelegramBotOptions) {
} }
} }
const botUsername = ctx.me?.username?.toLowerCase(); const botUsername = primaryCtx.me?.username?.toLowerCase();
const allowFromList = Array.isArray(allowFrom) const allowFromList = Array.isArray(allowFrom)
? allowFrom.map((entry) => String(entry).trim()).filter(Boolean) ? allowFrom.map((entry) => String(entry).trim()).filter(Boolean)
: []; : [];
@@ -171,32 +174,15 @@ export function createTelegramBot(opts: TelegramBotOptions) {
!hasAnyMention && !hasAnyMention &&
commandAuthorized && commandAuthorized &&
hasControlCommand(msg.text ?? msg.caption ?? ""); hasControlCommand(msg.text ?? msg.caption ?? "");
const canDetectMention = const canDetectMention = Boolean(botUsername) || mentionRegexes.length > 0;
Boolean(botUsername) || mentionRegexes.length > 0;
if (isGroup && requireMention && canDetectMention) { if (isGroup && requireMention && canDetectMention) {
if (!wasMentioned && !shouldBypassMention) { if (!wasMentioned && !shouldBypassMention) {
logger.info( logger.info({ chatId, reason: "no-mention" }, "skipping group message");
{ chatId, reason: "no-mention" },
"skipping group message",
);
return; return;
} }
} }
const media = await resolveMedia( // ACK reactions
ctx,
mediaMaxBytes,
opts.token,
opts.proxyFetch,
);
const replyTarget = describeReplyTarget(msg);
const rawBody = (
msg.text ??
msg.caption ??
media?.placeholder ??
""
).trim();
if (!rawBody) return;
const shouldAckReaction = () => { const shouldAckReaction = () => {
if (!ackReaction) return false; if (!ackReaction) return false;
if (ackReactionScope === "all") return true; if (ackReactionScope === "all") return true;
@@ -204,7 +190,7 @@ export function createTelegramBot(opts: TelegramBotOptions) {
if (ackReactionScope === "group-all") return isGroup; if (ackReactionScope === "group-all") return isGroup;
if (ackReactionScope === "group-mentions") { if (ackReactionScope === "group-mentions") {
if (!isGroup) return false; if (!isGroup) return false;
if (!resolveGroupRequireMention(chatId)) return false; if (!requireMention) return false;
if (!canDetectMention) return false; if (!canDetectMention) return false;
return wasMentioned || shouldBypassMention; return wasMentioned || shouldBypassMention;
} }
@@ -230,8 +216,26 @@ export function createTelegramBot(opts: TelegramBotOptions) {
}); });
} }
} }
let placeholder = "";
if (msg.photo) placeholder = "<media:image>";
else if (msg.video) placeholder = "<media:video>";
else if (msg.audio || msg.voice) placeholder = "<media:audio>";
else if (msg.document) placeholder = "<media:document>";
const replyTarget = describeReplyTarget(msg);
const rawBody = (msg.text ?? msg.caption ?? placeholder).trim();
if (!rawBody && allMedia.length === 0) return;
let bodyText = rawBody;
if (!bodyText && allMedia.length > 0) {
bodyText = `<media:image>${allMedia.length > 1 ? ` (${allMedia.length} images)` : ""}`;
}
const replySuffix = replyTarget const replySuffix = replyTarget
? `\n\n[Replying to ${replyTarget.sender}${replyTarget.id ? ` id:${replyTarget.id}` : ""}]\n${replyTarget.body}\n[/Replying]` ? `\n\n[Replying to ${replyTarget.sender}${
replyTarget.id ? ` id:${replyTarget.id}` : ""
}]\n${replyTarget.body}\n[/Replying]`
: ""; : "";
const body = formatAgentEnvelope({ const body = formatAgentEnvelope({
surface: "Telegram", surface: "Telegram",
@@ -239,7 +243,7 @@ export function createTelegramBot(opts: TelegramBotOptions) {
? buildGroupLabel(msg, chatId) ? buildGroupLabel(msg, chatId)
: buildSenderLabel(msg, chatId), : buildSenderLabel(msg, chatId),
timestamp: msg.date ? msg.date * 1000 : undefined, timestamp: msg.date ? msg.date * 1000 : undefined,
body: `${rawBody}${replySuffix}`, body: `${bodyText}${replySuffix}`,
}); });
const ctxPayload = { const ctxPayload = {
@@ -258,9 +262,15 @@ export function createTelegramBot(opts: TelegramBotOptions) {
ReplyToSender: replyTarget?.sender, ReplyToSender: replyTarget?.sender,
Timestamp: msg.date ? msg.date * 1000 : undefined, Timestamp: msg.date ? msg.date * 1000 : undefined,
WasMentioned: isGroup ? wasMentioned : undefined, WasMentioned: isGroup ? wasMentioned : undefined,
MediaPath: media?.path, MediaPath: allMedia[0]?.path,
MediaType: media?.contentType, MediaType: allMedia[0]?.contentType,
MediaUrl: media?.path, MediaUrl: allMedia[0]?.path,
MediaPaths: allMedia.length > 0 ? allMedia.map((m) => m.path) : undefined,
MediaUrls: allMedia.length > 0 ? allMedia.map((m) => m.path) : undefined,
MediaTypes:
allMedia.length > 0
? (allMedia.map((m) => m.contentType).filter(Boolean) as string[])
: undefined,
CommandAuthorized: commandAuthorized, CommandAuthorized: commandAuthorized,
}; };
@@ -285,8 +295,10 @@ export function createTelegramBot(opts: TelegramBotOptions) {
if (shouldLogVerbose()) { if (shouldLogVerbose()) {
const preview = body.slice(0, 200).replace(/\n/g, "\\n"); const preview = body.slice(0, 200).replace(/\n/g, "\\n");
const mediaInfo =
allMedia.length > 1 ? ` mediaCount=${allMedia.length}` : "";
logVerbose( logVerbose(
`telegram inbound: chatId=${chatId} from=${ctxPayload.From} len=${body.length} preview="${preview}"`, `telegram inbound: chatId=${chatId} from=${ctxPayload.From} len=${body.length}${mediaInfo} preview="${preview}"`,
); );
} }
@@ -327,11 +339,96 @@ export function createTelegramBot(opts: TelegramBotOptions) {
}); });
typingController?.markDispatchIdle(); typingController?.markDispatchIdle();
if (!queuedFinal) return; if (!queuedFinal) return;
};
bot.on("message", async (ctx) => {
try {
const msg = ctx.message;
if (!msg) return;
const chatId = msg.chat.id;
const isGroup =
msg.chat.type === "group" || msg.chat.type === "supergroup";
// Group policy check - skip disallowed groups early
if (isGroup) {
const groupPolicy = resolveGroupPolicy(chatId);
if (groupPolicy.allowlistEnabled && !groupPolicy.allowed) {
logger.info(
{ chatId, title: msg.chat.title, reason: "not-allowed" },
"skipping group message",
);
return;
}
}
// Media group handling - buffer multi-image messages
const mediaGroupId = (msg as { media_group_id?: string }).media_group_id;
if (mediaGroupId) {
const existing = mediaGroupBuffer.get(mediaGroupId);
if (existing) {
clearTimeout(existing.timer);
existing.messages.push({ msg, ctx });
existing.timer = setTimeout(async () => {
mediaGroupBuffer.delete(mediaGroupId);
await processMediaGroup(existing);
}, MEDIA_GROUP_TIMEOUT_MS);
} else {
const entry: MediaGroupEntry = {
messages: [{ msg, ctx }],
timer: setTimeout(async () => {
mediaGroupBuffer.delete(mediaGroupId);
await processMediaGroup(entry);
}, MEDIA_GROUP_TIMEOUT_MS),
};
mediaGroupBuffer.set(mediaGroupId, entry);
}
return;
}
const media = await resolveMedia(
ctx,
mediaMaxBytes,
opts.token,
opts.proxyFetch,
);
const allMedia = media
? [{ path: media.path, contentType: media.contentType }]
: [];
await processMessage(ctx, allMedia);
} catch (err) { } catch (err) {
runtime.error?.(danger(`handler failed: ${String(err)}`)); runtime.error?.(danger(`handler failed: ${String(err)}`));
} }
}); });
const processMediaGroup = async (entry: MediaGroupEntry) => {
try {
entry.messages.sort((a, b) => a.msg.message_id - b.msg.message_id);
const captionMsg = entry.messages.find(
(m) => m.msg.caption || m.msg.text,
);
const primaryEntry = captionMsg ?? entry.messages[0];
const allMedia: Array<{ path: string; contentType?: string }> = [];
for (const { ctx } of entry.messages) {
const media = await resolveMedia(
ctx,
mediaMaxBytes,
opts.token,
opts.proxyFetch,
);
if (media) {
allMedia.push({ path: media.path, contentType: media.contentType });
}
}
await processMessage(primaryEntry.ctx, allMedia);
} catch (err) {
runtime.error?.(danger(`media group handler failed: ${String(err)}`));
}
};
return bot; return bot;
} }