From 34fea720f8bb2da6b87825c462e48616ab67f194 Mon Sep 17 00:00:00 2001 From: Ayaan Zaidi Date: Tue, 27 Jan 2026 12:47:04 +0530 Subject: [PATCH] fix(telegram): improve sticker vision + cache (#2548) (thanks @longjos) --- CHANGELOG.md | 1 + docs/channels/telegram.md | 18 ++-- src/agents/tools/telegram-actions.test.ts | 40 ++++++++ src/agents/tools/telegram-actions.ts | 4 +- src/channels/plugins/actions/telegram.test.ts | 7 ++ src/channels/plugins/actions/telegram.ts | 2 +- src/media-understanding/runner.ts | 33 +++++++ src/telegram/bot-message-dispatch.ts | 1 + ...s-media-file-path-no-file-download.test.ts | 97 +++++++++++++++++++ src/telegram/bot/delivery.ts | 22 ++++- src/telegram/sticker-cache.ts | 52 +++++----- 11 files changed, 240 insertions(+), 37 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e6bb640bc..442dd52a0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -53,6 +53,7 @@ Status: unreleased. - Telegram: keep topic IDs in restart sentinel notifications. (#1807) Thanks @hsrvc. - Telegram: add optional silent send flag (disable notifications). (#2382) Thanks @Suksham-sharma. - Telegram: support editing sent messages via message(action="edit"). (#2394) Thanks @marcelomar21. +- Telegram: add sticker receive/send with vision caching. (#2548) Thanks @longjos. - Config: apply config.env before ${VAR} substitution. (#1813) Thanks @spanishflu-est1918. - Slack: clear ack reaction after streamed replies. (#2044) Thanks @fancyboi999. - macOS: keep custom SSH usernames in remote target. (#2046) Thanks @algal. diff --git a/docs/channels/telegram.md b/docs/channels/telegram.md index 2d8c472bd..56920f131 100644 --- a/docs/channels/telegram.md +++ b/docs/channels/telegram.md @@ -395,10 +395,13 @@ When a user sends a sticker, Clawdbot handles it based on the sticker type: - **Animated stickers (TGS):** Skipped (Lottie format not supported for processing). - **Video stickers (WEBM):** Skipped (video format not supported for processing). -Template context fields available when receiving stickers: -- `StickerEmoji` — the emoji associated with the sticker -- `StickerSetName` — the name of the sticker set -- `StickerFileId` — the Telegram file ID (used for sending the same sticker back) +Template context field available when receiving stickers: +- `Sticker` — object with: + - `emoji` — emoji associated with the sticker + - `setName` — name of the sticker set + - `fileId` — Telegram file ID (send the same sticker back) + - `fileUniqueId` — stable ID for cache lookup + - `cachedDescription` — cached vision description when available ### Sticker cache @@ -416,10 +419,11 @@ Stickers are processed through the AI's vision capabilities to generate descript ```json { "fileId": "CAACAgIAAxkBAAI...", + "fileUniqueId": "AgADBAADb6cxG2Y", "emoji": "👋", "setName": "CoolCats", "description": "A cartoon cat waving enthusiastically", - "addedAt": "2026-01-15T10:30:00.000Z" + "cachedAt": "2026-01-15T10:30:00.000Z" } ``` @@ -458,7 +462,7 @@ The agent can send and search stickers using the `sticker` and `sticker-search` ``` Parameters: -- `fileId` (required) — the Telegram file ID of the sticker. Obtain this from `StickerFileId` when receiving a sticker, or from a `sticker-search` result. +- `fileId` (required) — the Telegram file ID of the sticker. Obtain this from `Sticker.fileId` when receiving a sticker, or from a `sticker-search` result. - `replyTo` (optional) — message ID to reply to. - `threadId` (optional) — message thread ID for forum topics. @@ -543,7 +547,7 @@ Outbound Telegram API calls retry on transient network/429 errors with exponenti - Tool: `telegram` with `react` action (`chatId`, `messageId`, `emoji`). - Tool: `telegram` with `deleteMessage` action (`chatId`, `messageId`). - Reaction removal semantics: see [/tools/reactions](/tools/reactions). -- Tool gating: `channels.telegram.actions.reactions`, `channels.telegram.actions.sendMessage`, `channels.telegram.actions.deleteMessage` (default: enabled). +- Tool gating: `channels.telegram.actions.reactions`, `channels.telegram.actions.sendMessage`, `channels.telegram.actions.deleteMessage` (default: enabled), and `channels.telegram.actions.sticker` (default: disabled). ## Reaction notifications diff --git a/src/agents/tools/telegram-actions.test.ts b/src/agents/tools/telegram-actions.test.ts index 5c0629e38..db276849b 100644 --- a/src/agents/tools/telegram-actions.test.ts +++ b/src/agents/tools/telegram-actions.test.ts @@ -8,12 +8,17 @@ const sendMessageTelegram = vi.fn(async () => ({ messageId: "789", chatId: "123", })); +const sendStickerTelegram = vi.fn(async () => ({ + messageId: "456", + chatId: "123", +})); const deleteMessageTelegram = vi.fn(async () => ({ ok: true })); const originalToken = process.env.TELEGRAM_BOT_TOKEN; vi.mock("../../telegram/send.js", () => ({ reactMessageTelegram: (...args: unknown[]) => reactMessageTelegram(...args), sendMessageTelegram: (...args: unknown[]) => sendMessageTelegram(...args), + sendStickerTelegram: (...args: unknown[]) => sendStickerTelegram(...args), deleteMessageTelegram: (...args: unknown[]) => deleteMessageTelegram(...args), })); @@ -21,6 +26,7 @@ describe("handleTelegramAction", () => { beforeEach(() => { reactMessageTelegram.mockClear(); sendMessageTelegram.mockClear(); + sendStickerTelegram.mockClear(); deleteMessageTelegram.mockClear(); process.env.TELEGRAM_BOT_TOKEN = "tok"; }); @@ -96,6 +102,40 @@ describe("handleTelegramAction", () => { ); }); + it("rejects sticker actions when disabled by default", async () => { + const cfg = { channels: { telegram: { botToken: "tok" } } } as ClawdbotConfig; + await expect( + handleTelegramAction( + { + action: "sendSticker", + to: "123", + fileId: "sticker", + }, + cfg, + ), + ).rejects.toThrow(/sticker actions are disabled/i); + expect(sendStickerTelegram).not.toHaveBeenCalled(); + }); + + it("sends stickers when enabled", async () => { + const cfg = { + channels: { telegram: { botToken: "tok", actions: { sticker: true } } }, + } as ClawdbotConfig; + await handleTelegramAction( + { + action: "sendSticker", + to: "123", + fileId: "sticker", + }, + cfg, + ); + expect(sendStickerTelegram).toHaveBeenCalledWith( + "123", + "sticker", + expect.objectContaining({ token: "tok" }), + ); + }); + it("removes reactions when remove flag set", async () => { const cfg = { channels: { telegram: { botToken: "tok", reactionLevel: "extensive" } }, diff --git a/src/agents/tools/telegram-actions.ts b/src/agents/tools/telegram-actions.ts index 40a97d874..d2a4e4b93 100644 --- a/src/agents/tools/telegram-actions.ts +++ b/src/agents/tools/telegram-actions.ts @@ -258,7 +258,7 @@ export async function handleTelegramAction( } if (action === "sendSticker") { - if (!isActionEnabled("sticker")) { + if (!isActionEnabled("sticker", false)) { throw new Error( "Telegram sticker actions are disabled. Set channels.telegram.actions.sticker to true.", ); @@ -291,7 +291,7 @@ export async function handleTelegramAction( } if (action === "searchSticker") { - if (!isActionEnabled("sticker")) { + if (!isActionEnabled("sticker", false)) { throw new Error( "Telegram sticker actions are disabled. Set channels.telegram.actions.sticker to true.", ); diff --git a/src/channels/plugins/actions/telegram.test.ts b/src/channels/plugins/actions/telegram.test.ts index b2673134d..e61a73908 100644 --- a/src/channels/plugins/actions/telegram.test.ts +++ b/src/channels/plugins/actions/telegram.test.ts @@ -10,6 +10,13 @@ vi.mock("../../../agents/tools/telegram-actions.js", () => ({ })); describe("telegramMessageActions", () => { + it("excludes sticker actions when not enabled", () => { + const cfg = { channels: { telegram: { botToken: "tok" } } } as ClawdbotConfig; + const actions = telegramMessageActions.listActions({ cfg }); + expect(actions).not.toContain("sticker"); + expect(actions).not.toContain("sticker-search"); + }); + it("allows media-only sends and passes asVoice", async () => { handleTelegramAction.mockClear(); const cfg = { channels: { telegram: { botToken: "tok" } } } as ClawdbotConfig; diff --git a/src/channels/plugins/actions/telegram.ts b/src/channels/plugins/actions/telegram.ts index f8c7dc0fb..2acfaf9f1 100644 --- a/src/channels/plugins/actions/telegram.ts +++ b/src/channels/plugins/actions/telegram.ts @@ -46,7 +46,7 @@ export const telegramMessageActions: ChannelMessageActionAdapter = { if (gate("reactions")) actions.add("react"); if (gate("deleteMessage")) actions.add("delete"); if (gate("editMessage")) actions.add("edit"); - if (gate("sticker")) { + if (gate("sticker", false)) { actions.add("sticker"); actions.add("sticker-search"); } diff --git a/src/media-understanding/runner.ts b/src/media-understanding/runner.ts index 9e92d67c0..36636c542 100644 --- a/src/media-understanding/runner.ts +++ b/src/media-understanding/runner.ts @@ -412,6 +412,39 @@ async function resolveAutoEntries(params: { return []; } +export async function resolveAutoImageModel(params: { + cfg: ClawdbotConfig; + agentDir?: string; + activeModel?: ActiveMediaModel; +}): Promise { + const providerRegistry = buildProviderRegistry(); + const toActive = (entry: MediaUnderstandingModelConfig | null): ActiveMediaModel | null => { + if (!entry || entry.type === "cli") return null; + const provider = entry.provider; + if (!provider) return null; + const model = entry.model ?? DEFAULT_IMAGE_MODELS[provider]; + if (!model) return null; + return { provider, model }; + }; + const activeEntry = await resolveActiveModelEntry({ + cfg: params.cfg, + agentDir: params.agentDir, + providerRegistry, + capability: "image", + activeModel: params.activeModel, + }); + const resolvedActive = toActive(activeEntry); + if (resolvedActive) return resolvedActive; + const keyEntry = await resolveKeyEntry({ + cfg: params.cfg, + agentDir: params.agentDir, + providerRegistry, + capability: "image", + activeModel: params.activeModel, + }); + return toActive(keyEntry); +} + async function resolveActiveModelEntry(params: { cfg: ClawdbotConfig; agentDir?: string; diff --git a/src/telegram/bot-message-dispatch.ts b/src/telegram/bot-message-dispatch.ts index e24796d6c..a3e9c3faa 100644 --- a/src/telegram/bot-message-dispatch.ts +++ b/src/telegram/bot-message-dispatch.ts @@ -139,6 +139,7 @@ export const dispatchTelegramMessage = async ({ imagePath: ctxPayload.MediaPath, cfg, agentDir, + agentId: route.agentId, }); if (description) { // Format the description with sticker context diff --git a/src/telegram/bot.media.downloads-media-file-path-no-file-download.test.ts b/src/telegram/bot.media.downloads-media-file-path-no-file-download.test.ts index dd75e6798..165488426 100644 --- a/src/telegram/bot.media.downloads-media-file-path-no-file-download.test.ts +++ b/src/telegram/bot.media.downloads-media-file-path-no-file-download.test.ts @@ -7,6 +7,9 @@ const middlewareUseSpy = vi.fn(); const onSpy = vi.fn(); const stopSpy = vi.fn(); const sendChatActionSpy = vi.fn(); +const cacheStickerSpy = vi.fn(); +const getCachedStickerSpy = vi.fn(); +const describeStickerImageSpy = vi.fn(); type ApiStub = { config: { use: (arg: unknown) => void }; @@ -79,6 +82,12 @@ vi.mock("../config/sessions.js", async (importOriginal) => { }; }); +vi.mock("./sticker-cache.js", () => ({ + cacheSticker: (...args: unknown[]) => cacheStickerSpy(...args), + getCachedSticker: (...args: unknown[]) => getCachedStickerSpy(...args), + describeStickerImage: (...args: unknown[]) => describeStickerImageSpy(...args), +})); + vi.mock("./pairing-store.js", () => ({ readTelegramAllowFromStore: vi.fn(async () => [] as string[]), upsertTelegramPairingRequest: vi.fn(async () => ({ @@ -408,6 +417,12 @@ describe("telegram media groups", () => { describe("telegram stickers", () => { const STICKER_TEST_TIMEOUT_MS = process.platform === "win32" ? 30_000 : 20_000; + beforeEach(() => { + cacheStickerSpy.mockReset(); + getCachedStickerSpy.mockReset(); + describeStickerImageSpy.mockReset(); + }); + it( "downloads static sticker (WEBP) and includes sticker metadata", async () => { @@ -481,6 +496,88 @@ describe("telegram stickers", () => { STICKER_TEST_TIMEOUT_MS, ); + it( + "refreshes cached sticker metadata on cache hit", + async () => { + const { createTelegramBot } = await import("./bot.js"); + const replyModule = await import("../auto-reply/reply.js"); + const replySpy = replyModule.__replySpy as unknown as ReturnType; + + onSpy.mockReset(); + replySpy.mockReset(); + sendChatActionSpy.mockReset(); + + getCachedStickerSpy.mockReturnValue({ + fileId: "old_file_id", + fileUniqueId: "sticker_unique_456", + emoji: "😴", + setName: "OldSet", + description: "Cached description", + cachedAt: "2026-01-20T10:00:00.000Z", + }); + + const runtimeError = vi.fn(); + createTelegramBot({ + token: "tok", + runtime: { + log: vi.fn(), + error: runtimeError, + exit: () => { + throw new Error("exit"); + }, + }, + }); + const handler = onSpy.mock.calls.find((call) => call[0] === "message")?.[1] as ( + ctx: Record, + ) => Promise; + expect(handler).toBeDefined(); + + const fetchSpy = vi.spyOn(globalThis, "fetch" as never).mockResolvedValueOnce({ + ok: true, + status: 200, + statusText: "OK", + headers: { get: () => "image/webp" }, + arrayBuffer: async () => new Uint8Array([0x52, 0x49, 0x46, 0x46]).buffer, + } as Response); + + await handler({ + message: { + message_id: 103, + chat: { id: 1234, type: "private" }, + sticker: { + file_id: "new_file_id", + file_unique_id: "sticker_unique_456", + type: "regular", + width: 512, + height: 512, + is_animated: false, + is_video: false, + emoji: "🔥", + set_name: "NewSet", + }, + date: 1736380800, + }, + me: { username: "clawdbot_bot" }, + getFile: async () => ({ file_path: "stickers/sticker.webp" }), + }); + + expect(runtimeError).not.toHaveBeenCalled(); + expect(cacheStickerSpy).toHaveBeenCalledWith( + expect.objectContaining({ + fileId: "new_file_id", + emoji: "🔥", + setName: "NewSet", + }), + ); + const payload = replySpy.mock.calls[0][0]; + expect(payload.Sticker?.fileId).toBe("new_file_id"); + expect(payload.Sticker?.cachedDescription).toBe("Cached description"); + + fetchSpy.mockRestore(); + }, + STICKER_TEST_TIMEOUT_MS, + ); + it( "skips animated stickers (TGS format)", async () => { diff --git a/src/telegram/bot/delivery.ts b/src/telegram/bot/delivery.ts index f950417c7..779c0c026 100644 --- a/src/telegram/bot/delivery.ts +++ b/src/telegram/bot/delivery.ts @@ -22,7 +22,7 @@ import { buildInlineKeyboard } from "../send.js"; import { resolveTelegramVoiceSend } from "../voice.js"; import { buildTelegramThreadParams, resolveTelegramReplyId } from "./helpers.js"; import type { StickerMetadata, TelegramContext } from "./types.js"; -import { getCachedSticker } from "../sticker-cache.js"; +import { cacheSticker, getCachedSticker } from "../sticker-cache.js"; const PARSE_ERR_RE = /can't parse entities|parse entities|find end of the entity/i; const VOICE_FORBIDDEN_RE = /VOICE_MESSAGES_FORBIDDEN/; @@ -303,14 +303,26 @@ export async function resolveMedia( const cached = sticker.file_unique_id ? getCachedSticker(sticker.file_unique_id) : null; if (cached) { logVerbose(`telegram: sticker cache hit for ${sticker.file_unique_id}`); + const fileId = sticker.file_id ?? cached.fileId; + const emoji = sticker.emoji ?? cached.emoji; + const setName = sticker.set_name ?? cached.setName; + if (fileId !== cached.fileId || emoji !== cached.emoji || setName !== cached.setName) { + // Refresh cached sticker metadata on hits so sends/searches use latest file_id. + cacheSticker({ + ...cached, + fileId, + emoji, + setName, + }); + } return { path: saved.path, contentType: saved.contentType, placeholder: "", stickerMetadata: { - emoji: cached.emoji, - setName: cached.setName, - fileId: cached.fileId, + emoji, + setName, + fileId, fileUniqueId: sticker.file_unique_id, cachedDescription: cached.description, }, @@ -330,7 +342,7 @@ export async function resolveMedia( }, }; } catch (err) { - logVerbose(`telegram: failed to process sticker: ${err}`); + logVerbose(`telegram: failed to process sticker: ${String(err)}`); return null; } } diff --git a/src/telegram/sticker-cache.ts b/src/telegram/sticker-cache.ts index 2c55563b7..38f421851 100644 --- a/src/telegram/sticker-cache.ts +++ b/src/telegram/sticker-cache.ts @@ -4,7 +4,13 @@ import type { ClawdbotConfig } from "../config/config.js"; import { STATE_DIR_CLAWDBOT } from "../config/paths.js"; import { loadJsonFile, saveJsonFile } from "../infra/json-file.js"; import { logVerbose } from "../globals.js"; -import { resolveApiKeyForProvider } from "../agents/model-auth.js"; +import { + findModelInCatalog, + loadModelCatalog, + modelSupportsVision, +} from "../agents/model-catalog.js"; +import { resolveDefaultModelForAgent } from "../agents/model-selection.js"; +import { resolveAutoImageModel } from "../media-understanding/runner.js"; const CACHE_FILE = path.join(STATE_DIR_CLAWDBOT, "telegram", "sticker-cache.json"); const CACHE_VERSION = 1; @@ -135,18 +141,11 @@ export function getCacheStats(): { count: number; oldestAt?: string; newestAt?: const STICKER_DESCRIPTION_PROMPT = "Describe this sticker image in 1-2 sentences. Focus on what the sticker depicts (character, object, action, emotion). Be concise and objective."; -const VISION_PROVIDERS = ["anthropic", "openai", "google", "minimax"] as const; -const DEFAULT_VISION_MODELS: Record = { - anthropic: "claude-sonnet-4-20250514", - openai: "gpt-4o-mini", - google: "gemini-2.0-flash", - minimax: "MiniMax-VL-01", -}; - export interface DescribeStickerParams { imagePath: string; cfg: ClawdbotConfig; agentDir?: string; + agentId?: string; } /** @@ -155,26 +154,35 @@ export interface DescribeStickerParams { * Returns null if no vision provider is available. */ export async function describeStickerImage(params: DescribeStickerParams): Promise { - const { imagePath, cfg, agentDir } = params; + const { imagePath, cfg, agentDir, agentId } = params; - // Find a vision provider with available API key - let provider: string | null = null; - for (const p of VISION_PROVIDERS) { - try { - await resolveApiKeyForProvider({ provider: p, cfg, agentDir }); - provider = p; - break; - } catch { - // No key for this provider, try next + const defaultModel = resolveDefaultModelForAgent({ cfg, agentId }); + let activeModel = undefined as { provider: string; model: string } | undefined; + try { + const catalog = await loadModelCatalog({ config: cfg }); + const entry = findModelInCatalog(catalog, defaultModel.provider, defaultModel.model); + if (modelSupportsVision(entry)) { + activeModel = { provider: defaultModel.provider, model: defaultModel.model }; } + } catch { + // Ignore catalog failures; fall back to auto selection. } - if (!provider) { + const resolved = await resolveAutoImageModel({ + cfg, + agentDir, + activeModel, + }); + if (!resolved) { logVerbose("telegram: no vision provider available for sticker description"); return null; } - const model = DEFAULT_VISION_MODELS[provider]; + const { provider, model } = resolved; + if (!model) { + logVerbose(`telegram: no vision model available for ${provider}`); + return null; + } logVerbose(`telegram: describing sticker with ${provider}/${model}`); try { @@ -195,7 +203,7 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi }); return result.text; } catch (err) { - logVerbose(`telegram: failed to describe sticker: ${err}`); + logVerbose(`telegram: failed to describe sticker: ${String(err)}`); return null; } }