fix(telegram): improve sticker vision + cache (#2548) (thanks @longjos)

This commit is contained in:
Ayaan Zaidi
2026-01-27 12:47:04 +05:30
committed by Ayaan Zaidi
parent 506bed5aed
commit 34fea720f8
11 changed files with 240 additions and 37 deletions

View File

@@ -53,6 +53,7 @@ Status: unreleased.
- Telegram: keep topic IDs in restart sentinel notifications. (#1807) Thanks @hsrvc.
- Telegram: add optional silent send flag (disable notifications). (#2382) Thanks @Suksham-sharma.
- Telegram: support editing sent messages via message(action="edit"). (#2394) Thanks @marcelomar21.
- Telegram: add sticker receive/send with vision caching. (#2548) Thanks @longjos.
- Config: apply config.env before ${VAR} substitution. (#1813) Thanks @spanishflu-est1918.
- Slack: clear ack reaction after streamed replies. (#2044) Thanks @fancyboi999.
- macOS: keep custom SSH usernames in remote target. (#2046) Thanks @algal.

View File

@@ -395,10 +395,13 @@ When a user sends a sticker, Clawdbot handles it based on the sticker type:
- **Animated stickers (TGS):** Skipped (Lottie format not supported for processing).
- **Video stickers (WEBM):** Skipped (video format not supported for processing).
Template context fields available when receiving stickers:
- `StickerEmoji` — the emoji associated with the sticker
- `StickerSetName` — the name of the sticker set
- `StickerFileId` — the Telegram file ID (used for sending the same sticker back)
Template context field available when receiving stickers:
- `Sticker` — object with:
- `emoji` — emoji associated with the sticker
- `setName` — name of the sticker set
- `fileId` — Telegram file ID (use it to send the same sticker back)
- `fileUniqueId` — stable ID for cache lookup
- `cachedDescription` — cached vision description when available
### Sticker cache
@@ -416,10 +419,11 @@ Stickers are processed through the AI's vision capabilities to generate descript
```json
{
"fileId": "CAACAgIAAxkBAAI...",
"fileUniqueId": "AgADBAADb6cxG2Y",
"emoji": "👋",
"setName": "CoolCats",
"description": "A cartoon cat waving enthusiastically",
"addedAt": "2026-01-15T10:30:00.000Z"
"cachedAt": "2026-01-15T10:30:00.000Z"
}
```
@@ -458,7 +462,7 @@ The agent can send and search stickers using the `sticker` and `sticker-search`
```
Parameters:
- `fileId` (required) — the Telegram file ID of the sticker. Obtain this from `StickerFileId` when receiving a sticker, or from a `sticker-search` result.
- `fileId` (required) — the Telegram file ID of the sticker. Obtain this from `Sticker.fileId` when receiving a sticker, or from a `sticker-search` result.
- `replyTo` (optional) — message ID to reply to.
- `threadId` (optional) — message thread ID for forum topics.
@@ -543,7 +547,7 @@ Outbound Telegram API calls retry on transient network/429 errors with exponenti
- Tool: `telegram` with `react` action (`chatId`, `messageId`, `emoji`).
- Tool: `telegram` with `deleteMessage` action (`chatId`, `messageId`).
- Reaction removal semantics: see [/tools/reactions](/tools/reactions).
- Tool gating: `channels.telegram.actions.reactions`, `channels.telegram.actions.sendMessage`, `channels.telegram.actions.deleteMessage` (default: enabled).
- Tool gating: `channels.telegram.actions.reactions`, `channels.telegram.actions.sendMessage`, `channels.telegram.actions.deleteMessage` (default: enabled), and `channels.telegram.actions.sticker` (default: disabled).
## Reaction notifications

View File

@@ -8,12 +8,17 @@ const sendMessageTelegram = vi.fn(async () => ({
messageId: "789",
chatId: "123",
}));
// Stub for the sticker send helper; resolves with fixed message/chat IDs.
const sendStickerTelegram = vi.fn(async () => ({
messageId: "456",
chatId: "123",
}));
const deleteMessageTelegram = vi.fn(async () => ({ ok: true }));
// Token value captured before beforeEach overwrites TELEGRAM_BOT_TOKEN with "tok".
// NOTE(review): presumably restored in a teardown hook that is not visible here — confirm.
const originalToken = process.env.TELEGRAM_BOT_TOKEN;
// Replace the telegram send module with thin wrappers. Each wrapper looks up its
// vi.fn stub when it is called, not when the factory runs.
// NOTE(review): this indirection is likely required because vitest hoists vi.mock
// above the const declarations — confirm against the vitest docs before simplifying.
vi.mock("../../telegram/send.js", () => ({
reactMessageTelegram: (...args: unknown[]) => reactMessageTelegram(...args),
sendMessageTelegram: (...args: unknown[]) => sendMessageTelegram(...args),
sendStickerTelegram: (...args: unknown[]) => sendStickerTelegram(...args),
deleteMessageTelegram: (...args: unknown[]) => deleteMessageTelegram(...args),
}));
@@ -21,6 +26,7 @@ describe("handleTelegramAction", () => {
beforeEach(() => {
  // Forget calls recorded by earlier tests on every send-layer stub.
  const sendStubs = [
    reactMessageTelegram,
    sendMessageTelegram,
    sendStickerTelegram,
    deleteMessageTelegram,
  ];
  for (const stub of sendStubs) {
    stub.mockClear();
  }
  // Each test starts with a known bot token in the environment.
  process.env.TELEGRAM_BOT_TOKEN = "tok";
});
@@ -96,6 +102,40 @@ describe("handleTelegramAction", () => {
);
});
it("rejects sticker actions when disabled by default", async () => {
  // The config has no `actions.sticker` entry, so the sticker gate is left at its default.
  const cfg = { channels: { telegram: { botToken: "tok" } } } as ClawdbotConfig;
  const attempt = handleTelegramAction(
    { action: "sendSticker", to: "123", fileId: "sticker" },
    cfg,
  );
  await expect(attempt).rejects.toThrow(/sticker actions are disabled/i);
  // The send helper must never be reached when the action is refused.
  expect(sendStickerTelegram).not.toHaveBeenCalled();
});
it("sends stickers when enabled", async () => {
  // Explicit opt-in through channels.telegram.actions.sticker.
  const cfg = {
    channels: { telegram: { botToken: "tok", actions: { sticker: true } } },
  } as ClawdbotConfig;
  await handleTelegramAction({ action: "sendSticker", to: "123", fileId: "sticker" }, cfg);
  // Target and file ID are forwarded positionally; the token travels in the options object.
  const expectedOptions = expect.objectContaining({ token: "tok" });
  expect(sendStickerTelegram).toHaveBeenCalledWith("123", "sticker", expectedOptions);
});
it("removes reactions when remove flag set", async () => {
const cfg = {
channels: { telegram: { botToken: "tok", reactionLevel: "extensive" } },

View File

@@ -258,7 +258,7 @@ export async function handleTelegramAction(
}
if (action === "sendSticker") {
if (!isActionEnabled("sticker")) {
if (!isActionEnabled("sticker", false)) {
throw new Error(
"Telegram sticker actions are disabled. Set channels.telegram.actions.sticker to true.",
);
@@ -291,7 +291,7 @@ export async function handleTelegramAction(
}
if (action === "searchSticker") {
if (!isActionEnabled("sticker")) {
if (!isActionEnabled("sticker", false)) {
throw new Error(
"Telegram sticker actions are disabled. Set channels.telegram.actions.sticker to true.",
);

View File

@@ -10,6 +10,13 @@ vi.mock("../../../agents/tools/telegram-actions.js", () => ({
}));
describe("telegramMessageActions", () => {
it("excludes sticker actions when not enabled", () => {
  // No `actions.sticker` entry: neither sticker action may be advertised.
  const cfg = { channels: { telegram: { botToken: "tok" } } } as ClawdbotConfig;
  const listed = telegramMessageActions.listActions({ cfg });
  for (const hiddenAction of ["sticker", "sticker-search"]) {
    expect(listed).not.toContain(hiddenAction);
  }
});
it("allows media-only sends and passes asVoice", async () => {
handleTelegramAction.mockClear();
const cfg = { channels: { telegram: { botToken: "tok" } } } as ClawdbotConfig;

View File

@@ -46,7 +46,7 @@ export const telegramMessageActions: ChannelMessageActionAdapter = {
if (gate("reactions")) actions.add("react");
if (gate("deleteMessage")) actions.add("delete");
if (gate("editMessage")) actions.add("edit");
if (gate("sticker")) {
if (gate("sticker", false)) {
actions.add("sticker");
actions.add("sticker-search");
}

View File

@@ -412,6 +412,39 @@ async function resolveAutoEntries(params: {
return [];
}
/**
 * Pick the provider/model pair to use for image understanding.
 *
 * The entry resolved from the active model is tried first; if it does not yield
 * a usable pair, the entry resolved by `resolveKeyEntry` is tried instead.
 *
 * @param params.cfg - Configuration forwarded to both resolvers.
 * @param params.agentDir - Optional agent directory forwarded to both resolvers.
 * @param params.activeModel - Optional currently active model forwarded to both resolvers.
 * @returns The resolved provider/model pair, or `null` when neither entry is usable.
 */
export async function resolveAutoImageModel(params: {
  cfg: ClawdbotConfig;
  agentDir?: string;
  activeModel?: ActiveMediaModel;
}): Promise<ActiveMediaModel | null> {
  // Arguments shared by both lookups; the registry is built once and reused.
  const lookupArgs = {
    cfg: params.cfg,
    agentDir: params.agentDir,
    providerRegistry: buildProviderRegistry(),
    capability: "image" as const,
    activeModel: params.activeModel,
  };

  // Convert a config entry into a provider/model pair. CLI entries and entries
  // without a provider are rejected; a missing model falls back to the
  // provider's default image model.
  function asActiveModel(candidate: MediaUnderstandingModelConfig | null): ActiveMediaModel | null {
    if (!candidate || candidate.type === "cli") {
      return null;
    }
    const { provider } = candidate;
    if (!provider) {
      return null;
    }
    const model = candidate.model ?? DEFAULT_IMAGE_MODELS[provider];
    return model ? { provider, model } : null;
  }

  const fromActive = asActiveModel(await resolveActiveModelEntry({ ...lookupArgs }));
  if (fromActive) {
    return fromActive;
  }
  // The key-based lookup only runs when the active model gave nothing usable.
  return asActiveModel(await resolveKeyEntry({ ...lookupArgs }));
}
async function resolveActiveModelEntry(params: {
cfg: ClawdbotConfig;
agentDir?: string;

View File

@@ -139,6 +139,7 @@ export const dispatchTelegramMessage = async ({
imagePath: ctxPayload.MediaPath,
cfg,
agentDir,
agentId: route.agentId,
});
if (description) {
// Format the description with sticker context

View File

@@ -7,6 +7,9 @@ const middlewareUseSpy = vi.fn();
const onSpy = vi.fn();
const stopSpy = vi.fn();
const sendChatActionSpy = vi.fn();
// Spies standing in for the ./sticker-cache.js exports; the module mock forwards to them.
const cacheStickerSpy = vi.fn();
const getCachedStickerSpy = vi.fn();
const describeStickerImageSpy = vi.fn();
type ApiStub = {
config: { use: (arg: unknown) => void };
@@ -79,6 +82,12 @@ vi.mock("../config/sessions.js", async (importOriginal) => {
};
});
// Route the sticker-cache exports through the module-level spies. The wrappers
// look the spies up at call time rather than when the factory runs.
// NOTE(review): likely needed because vitest hoists vi.mock — confirm before simplifying.
vi.mock("./sticker-cache.js", () => ({
cacheSticker: (...args: unknown[]) => cacheStickerSpy(...args),
getCachedSticker: (...args: unknown[]) => getCachedStickerSpy(...args),
describeStickerImage: (...args: unknown[]) => describeStickerImageSpy(...args),
}));
vi.mock("./pairing-store.js", () => ({
readTelegramAllowFromStore: vi.fn(async () => [] as string[]),
upsertTelegramPairingRequest: vi.fn(async () => ({
@@ -408,6 +417,12 @@ describe("telegram media groups", () => {
describe("telegram stickers", () => {
const STICKER_TEST_TIMEOUT_MS = process.platform === "win32" ? 30_000 : 20_000;
beforeEach(() => {
  // Reset every sticker-cache spy so stubbed return values do not carry over between tests.
  for (const spy of [cacheStickerSpy, getCachedStickerSpy, describeStickerImageSpy]) {
    spy.mockReset();
  }
});
it(
"downloads static sticker (WEBP) and includes sticker metadata",
async () => {
@@ -481,6 +496,88 @@ describe("telegram stickers", () => {
STICKER_TEST_TIMEOUT_MS,
);
// Cache-hit scenario: the cache holds an entry for the same file_unique_id but with
// an older file_id, emoji and set name. The test expects the cache to be rewritten
// with the incoming values while the cached description is still reused.
it(
"refreshes cached sticker metadata on cache hit",
async () => {
const { createTelegramBot } = await import("./bot.js");
const replyModule = await import("../auto-reply/reply.js");
const replySpy = replyModule.__replySpy as unknown as ReturnType<typeof vi.fn>;
onSpy.mockReset();
replySpy.mockReset();
sendChatActionSpy.mockReset();
// Stale cache entry: same fileUniqueId as the incoming sticker, different metadata.
getCachedStickerSpy.mockReturnValue({
fileId: "old_file_id",
fileUniqueId: "sticker_unique_456",
emoji: "😴",
setName: "OldSet",
description: "Cached description",
cachedAt: "2026-01-20T10:00:00.000Z",
});
const runtimeError = vi.fn();
createTelegramBot({
token: "tok",
runtime: {
log: vi.fn(),
error: runtimeError,
exit: () => {
throw new Error("exit");
},
},
});
// Pull out the callback the bot registered for "message" updates.
const handler = onSpy.mock.calls.find((call) => call[0] === "message")?.[1] as (
ctx: Record<string, unknown>,
) => Promise<void>;
expect(handler).toBeDefined();
// Stub one fetch call: an image/webp response whose body is the four ASCII bytes "RIFF".
const fetchSpy = vi.spyOn(globalThis, "fetch" as never).mockResolvedValueOnce({
ok: true,
status: 200,
statusText: "OK",
headers: { get: () => "image/webp" },
arrayBuffer: async () => new Uint8Array([0x52, 0x49, 0x46, 0x46]).buffer,
} as Response);
// Incoming static sticker carrying the new file_id, emoji and set name.
await handler({
message: {
message_id: 103,
chat: { id: 1234, type: "private" },
sticker: {
file_id: "new_file_id",
file_unique_id: "sticker_unique_456",
type: "regular",
width: 512,
height: 512,
is_animated: false,
is_video: false,
emoji: "🔥",
set_name: "NewSet",
},
date: 1736380800,
},
me: { username: "clawdbot_bot" },
getFile: async () => ({ file_path: "stickers/sticker.webp" }),
});
expect(runtimeError).not.toHaveBeenCalled();
// The cache must be rewritten with the metadata from the incoming message.
expect(cacheStickerSpy).toHaveBeenCalledWith(
expect.objectContaining({
fileId: "new_file_id",
emoji: "🔥",
setName: "NewSet",
}),
);
// The reply payload uses the fresh file ID but keeps the cached description.
const payload = replySpy.mock.calls[0][0];
expect(payload.Sticker?.fileId).toBe("new_file_id");
expect(payload.Sticker?.cachedDescription).toBe("Cached description");
fetchSpy.mockRestore();
},
STICKER_TEST_TIMEOUT_MS,
);
it(
"skips animated stickers (TGS format)",
async () => {

View File

@@ -22,7 +22,7 @@ import { buildInlineKeyboard } from "../send.js";
import { resolveTelegramVoiceSend } from "../voice.js";
import { buildTelegramThreadParams, resolveTelegramReplyId } from "./helpers.js";
import type { StickerMetadata, TelegramContext } from "./types.js";
import { getCachedSticker } from "../sticker-cache.js";
import { cacheSticker, getCachedSticker } from "../sticker-cache.js";
const PARSE_ERR_RE = /can't parse entities|parse entities|find end of the entity/i;
const VOICE_FORBIDDEN_RE = /VOICE_MESSAGES_FORBIDDEN/;
@@ -303,14 +303,26 @@ export async function resolveMedia(
const cached = sticker.file_unique_id ? getCachedSticker(sticker.file_unique_id) : null;
if (cached) {
logVerbose(`telegram: sticker cache hit for ${sticker.file_unique_id}`);
const fileId = sticker.file_id ?? cached.fileId;
const emoji = sticker.emoji ?? cached.emoji;
const setName = sticker.set_name ?? cached.setName;
if (fileId !== cached.fileId || emoji !== cached.emoji || setName !== cached.setName) {
// Refresh cached sticker metadata on hits so sends/searches use latest file_id.
cacheSticker({
...cached,
fileId,
emoji,
setName,
});
}
return {
path: saved.path,
contentType: saved.contentType,
placeholder: "<media:sticker>",
stickerMetadata: {
emoji: cached.emoji,
setName: cached.setName,
fileId: cached.fileId,
emoji,
setName,
fileId,
fileUniqueId: sticker.file_unique_id,
cachedDescription: cached.description,
},
@@ -330,7 +342,7 @@ export async function resolveMedia(
},
};
} catch (err) {
logVerbose(`telegram: failed to process sticker: ${err}`);
logVerbose(`telegram: failed to process sticker: ${String(err)}`);
return null;
}
}

View File

@@ -4,7 +4,13 @@ import type { ClawdbotConfig } from "../config/config.js";
import { STATE_DIR_CLAWDBOT } from "../config/paths.js";
import { loadJsonFile, saveJsonFile } from "../infra/json-file.js";
import { logVerbose } from "../globals.js";
import { resolveApiKeyForProvider } from "../agents/model-auth.js";
import {
findModelInCatalog,
loadModelCatalog,
modelSupportsVision,
} from "../agents/model-catalog.js";
import { resolveDefaultModelForAgent } from "../agents/model-selection.js";
import { resolveAutoImageModel } from "../media-understanding/runner.js";
const CACHE_FILE = path.join(STATE_DIR_CLAWDBOT, "telegram", "sticker-cache.json");
const CACHE_VERSION = 1;
@@ -135,18 +141,11 @@ export function getCacheStats(): { count: number; oldestAt?: string; newestAt?:
const STICKER_DESCRIPTION_PROMPT =
"Describe this sticker image in 1-2 sentences. Focus on what the sticker depicts (character, object, action, emotion). Be concise and objective.";
const VISION_PROVIDERS = ["anthropic", "openai", "google", "minimax"] as const;
const DEFAULT_VISION_MODELS: Record<string, string> = {
anthropic: "claude-sonnet-4-20250514",
openai: "gpt-4o-mini",
google: "gemini-2.0-flash",
minimax: "MiniMax-VL-01",
};
/** Inputs accepted by describeStickerImage. */
export interface DescribeStickerParams {
/** Path of the sticker image to describe. */
imagePath: string;
/** Configuration passed to default-model, model-catalog and image-model resolution. */
cfg: ClawdbotConfig;
/** Optional agent directory, passed through to image-model resolution. */
agentDir?: string;
/** Optional agent ID used to look up that agent's default model. */
agentId?: string;
}
/**
@@ -155,26 +154,35 @@ export interface DescribeStickerParams {
* Returns null if no vision provider is available.
*/
export async function describeStickerImage(params: DescribeStickerParams): Promise<string | null> {
const { imagePath, cfg, agentDir } = params;
const { imagePath, cfg, agentDir, agentId } = params;
// Find a vision provider with available API key
let provider: string | null = null;
for (const p of VISION_PROVIDERS) {
try {
await resolveApiKeyForProvider({ provider: p, cfg, agentDir });
provider = p;
break;
} catch {
// No key for this provider, try next
const defaultModel = resolveDefaultModelForAgent({ cfg, agentId });
let activeModel = undefined as { provider: string; model: string } | undefined;
try {
const catalog = await loadModelCatalog({ config: cfg });
const entry = findModelInCatalog(catalog, defaultModel.provider, defaultModel.model);
if (modelSupportsVision(entry)) {
activeModel = { provider: defaultModel.provider, model: defaultModel.model };
}
} catch {
// Ignore catalog failures; fall back to auto selection.
}
if (!provider) {
const resolved = await resolveAutoImageModel({
cfg,
agentDir,
activeModel,
});
if (!resolved) {
logVerbose("telegram: no vision provider available for sticker description");
return null;
}
const model = DEFAULT_VISION_MODELS[provider];
const { provider, model } = resolved;
if (!model) {
logVerbose(`telegram: no vision model available for ${provider}`);
return null;
}
logVerbose(`telegram: describing sticker with ${provider}/${model}`);
try {
@@ -195,7 +203,7 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi
});
return result.text;
} catch (err) {
logVerbose(`telegram: failed to describe sticker: ${err}`);
logVerbose(`telegram: failed to describe sticker: ${String(err)}`);
return null;
}
}