fix(telegram): improve sticker vision + cache (#2548) (thanks @longjos)
This commit is contained in:
@@ -53,6 +53,7 @@ Status: unreleased.
|
||||
- Telegram: keep topic IDs in restart sentinel notifications. (#1807) Thanks @hsrvc.
|
||||
- Telegram: add optional silent send flag (disable notifications). (#2382) Thanks @Suksham-sharma.
|
||||
- Telegram: support editing sent messages via message(action="edit"). (#2394) Thanks @marcelomar21.
|
||||
- Telegram: add sticker receive/send with vision caching. (#2548) Thanks @longjos.
|
||||
- Config: apply config.env before ${VAR} substitution. (#1813) Thanks @spanishflu-est1918.
|
||||
- Slack: clear ack reaction after streamed replies. (#2044) Thanks @fancyboi999.
|
||||
- macOS: keep custom SSH usernames in remote target. (#2046) Thanks @algal.
|
||||
|
||||
@@ -395,10 +395,13 @@ When a user sends a sticker, Clawdbot handles it based on the sticker type:
|
||||
- **Animated stickers (TGS):** Skipped (Lottie format not supported for processing).
|
||||
- **Video stickers (WEBM):** Skipped (video format not supported for processing).
|
||||
|
||||
Template context fields available when receiving stickers:
|
||||
- `StickerEmoji` — the emoji associated with the sticker
|
||||
- `StickerSetName` — the name of the sticker set
|
||||
- `StickerFileId` — the Telegram file ID (used for sending the same sticker back)
|
||||
Template context field available when receiving stickers:
|
||||
- `Sticker` — object with:
|
||||
- `emoji` — emoji associated with the sticker
|
||||
- `setName` — name of the sticker set
|
||||
- `fileId` — Telegram file ID (send the same sticker back)
|
||||
- `fileUniqueId` — stable ID for cache lookup
|
||||
- `cachedDescription` — cached vision description when available
|
||||
|
||||
### Sticker cache
|
||||
|
||||
@@ -416,10 +419,11 @@ Stickers are processed through the AI's vision capabilities to generate descript
|
||||
```json
|
||||
{
|
||||
"fileId": "CAACAgIAAxkBAAI...",
|
||||
"fileUniqueId": "AgADBAADb6cxG2Y",
|
||||
"emoji": "👋",
|
||||
"setName": "CoolCats",
|
||||
"description": "A cartoon cat waving enthusiastically",
|
||||
"addedAt": "2026-01-15T10:30:00.000Z"
|
||||
"cachedAt": "2026-01-15T10:30:00.000Z"
|
||||
}
|
||||
```
|
||||
|
||||
@@ -458,7 +462,7 @@ The agent can send and search stickers using the `sticker` and `sticker-search`
|
||||
```
|
||||
|
||||
Parameters:
|
||||
- `fileId` (required) — the Telegram file ID of the sticker. Obtain this from `StickerFileId` when receiving a sticker, or from a `sticker-search` result.
|
||||
- `fileId` (required) — the Telegram file ID of the sticker. Obtain this from `Sticker.fileId` when receiving a sticker, or from a `sticker-search` result.
|
||||
- `replyTo` (optional) — message ID to reply to.
|
||||
- `threadId` (optional) — message thread ID for forum topics.
|
||||
|
||||
@@ -543,7 +547,7 @@ Outbound Telegram API calls retry on transient network/429 errors with exponenti
|
||||
- Tool: `telegram` with `react` action (`chatId`, `messageId`, `emoji`).
|
||||
- Tool: `telegram` with `deleteMessage` action (`chatId`, `messageId`).
|
||||
- Reaction removal semantics: see [/tools/reactions](/tools/reactions).
|
||||
- Tool gating: `channels.telegram.actions.reactions`, `channels.telegram.actions.sendMessage`, `channels.telegram.actions.deleteMessage` (default: enabled).
|
||||
- Tool gating: `channels.telegram.actions.reactions`, `channels.telegram.actions.sendMessage`, `channels.telegram.actions.deleteMessage` (default: enabled), and `channels.telegram.actions.sticker` (default: disabled).
|
||||
|
||||
## Reaction notifications
|
||||
|
||||
|
||||
@@ -8,12 +8,17 @@ const sendMessageTelegram = vi.fn(async () => ({
|
||||
messageId: "789",
|
||||
chatId: "123",
|
||||
}));
|
||||
const sendStickerTelegram = vi.fn(async () => ({
|
||||
messageId: "456",
|
||||
chatId: "123",
|
||||
}));
|
||||
const deleteMessageTelegram = vi.fn(async () => ({ ok: true }));
|
||||
const originalToken = process.env.TELEGRAM_BOT_TOKEN;
|
||||
|
||||
vi.mock("../../telegram/send.js", () => ({
|
||||
reactMessageTelegram: (...args: unknown[]) => reactMessageTelegram(...args),
|
||||
sendMessageTelegram: (...args: unknown[]) => sendMessageTelegram(...args),
|
||||
sendStickerTelegram: (...args: unknown[]) => sendStickerTelegram(...args),
|
||||
deleteMessageTelegram: (...args: unknown[]) => deleteMessageTelegram(...args),
|
||||
}));
|
||||
|
||||
@@ -21,6 +26,7 @@ describe("handleTelegramAction", () => {
|
||||
beforeEach(() => {
|
||||
reactMessageTelegram.mockClear();
|
||||
sendMessageTelegram.mockClear();
|
||||
sendStickerTelegram.mockClear();
|
||||
deleteMessageTelegram.mockClear();
|
||||
process.env.TELEGRAM_BOT_TOKEN = "tok";
|
||||
});
|
||||
@@ -96,6 +102,40 @@ describe("handleTelegramAction", () => {
|
||||
);
|
||||
});
|
||||
|
||||
it("rejects sticker actions when disabled by default", async () => {
  // No `actions.sticker` entry in the config, so the sticker gate is left at its default.
  const cfg = { channels: { telegram: { botToken: "tok" } } } as ClawdbotConfig;

  const pending = handleTelegramAction(
    {
      action: "sendSticker",
      to: "123",
      fileId: "sticker",
    },
    cfg,
  );

  await expect(pending).rejects.toThrow(/sticker actions are disabled/i);
  // The gate must trip before anything is sent to Telegram.
  expect(sendStickerTelegram).not.toHaveBeenCalled();
});
|
||||
|
||||
it("sends stickers when enabled", async () => {
  const to = "123";
  const fileId = "sticker";
  // Explicit opt-in via channels.telegram.actions.sticker.
  const cfg = {
    channels: { telegram: { botToken: "tok", actions: { sticker: true } } },
  } as ClawdbotConfig;

  await handleTelegramAction({ action: "sendSticker", to, fileId }, cfg);

  // Target chat and file id are forwarded positionally; the bot token travels in the options bag.
  expect(sendStickerTelegram).toHaveBeenCalledWith(
    to,
    fileId,
    expect.objectContaining({ token: "tok" }),
  );
});
|
||||
|
||||
it("removes reactions when remove flag set", async () => {
|
||||
const cfg = {
|
||||
channels: { telegram: { botToken: "tok", reactionLevel: "extensive" } },
|
||||
|
||||
@@ -258,7 +258,7 @@ export async function handleTelegramAction(
|
||||
}
|
||||
|
||||
if (action === "sendSticker") {
|
||||
if (!isActionEnabled("sticker")) {
|
||||
if (!isActionEnabled("sticker", false)) {
|
||||
throw new Error(
|
||||
"Telegram sticker actions are disabled. Set channels.telegram.actions.sticker to true.",
|
||||
);
|
||||
@@ -291,7 +291,7 @@ export async function handleTelegramAction(
|
||||
}
|
||||
|
||||
if (action === "searchSticker") {
|
||||
if (!isActionEnabled("sticker")) {
|
||||
if (!isActionEnabled("sticker", false)) {
|
||||
throw new Error(
|
||||
"Telegram sticker actions are disabled. Set channels.telegram.actions.sticker to true.",
|
||||
);
|
||||
|
||||
@@ -10,6 +10,13 @@ vi.mock("../../../agents/tools/telegram-actions.js", () => ({
|
||||
}));
|
||||
|
||||
describe("telegramMessageActions", () => {
|
||||
it("excludes sticker actions when not enabled", () => {
  const cfg = { channels: { telegram: { botToken: "tok" } } } as ClawdbotConfig;
  const listed = telegramMessageActions.listActions({ cfg });

  // Neither sticker-related action may be advertised without an explicit opt-in.
  for (const hidden of ["sticker", "sticker-search"]) {
    expect(listed).not.toContain(hidden);
  }
});
|
||||
|
||||
it("allows media-only sends and passes asVoice", async () => {
|
||||
handleTelegramAction.mockClear();
|
||||
const cfg = { channels: { telegram: { botToken: "tok" } } } as ClawdbotConfig;
|
||||
|
||||
@@ -46,7 +46,7 @@ export const telegramMessageActions: ChannelMessageActionAdapter = {
|
||||
if (gate("reactions")) actions.add("react");
|
||||
if (gate("deleteMessage")) actions.add("delete");
|
||||
if (gate("editMessage")) actions.add("edit");
|
||||
if (gate("sticker")) {
|
||||
if (gate("sticker", false)) {
|
||||
actions.add("sticker");
|
||||
actions.add("sticker-search");
|
||||
}
|
||||
|
||||
@@ -412,6 +412,39 @@ async function resolveAutoEntries(params: {
|
||||
return [];
|
||||
}
|
||||
|
||||
/**
 * Picks the provider/model pair to use for automatic image understanding.
 *
 * Resolution order:
 * 1. the entry returned by `resolveActiveModelEntry` for the "image" capability;
 * 2. otherwise the entry returned by `resolveKeyEntry` for the same capability.
 *
 * An entry is usable only when it is not a CLI entry, names a provider, and has
 * either an explicit model or a fallback in `DEFAULT_IMAGE_MODELS`.
 *
 * @param params.cfg - Configuration forwarded to both lookups.
 * @param params.agentDir - Optional agent directory forwarded to both lookups.
 * @param params.activeModel - Optional currently active model forwarded to both lookups.
 * @returns The resolved provider/model, or `null` when neither lookup yields a usable entry.
 */
export async function resolveAutoImageModel(params: {
  cfg: ClawdbotConfig;
  agentDir?: string;
  activeModel?: ActiveMediaModel;
}): Promise<ActiveMediaModel | null> {
  const { cfg, agentDir, activeModel } = params;
  const providerRegistry = buildProviderRegistry();

  // Converts a config entry into a provider/model pair, or null when it cannot be used.
  function asActiveModel(entry: MediaUnderstandingModelConfig | null): ActiveMediaModel | null {
    if (!entry) return null;
    if (entry.type === "cli") return null;
    const { provider } = entry;
    if (!provider) return null;
    const model = entry.model ?? DEFAULT_IMAGE_MODELS[provider];
    return model ? { provider, model } : null;
  }

  const fromActive = asActiveModel(
    await resolveActiveModelEntry({
      cfg,
      agentDir,
      providerRegistry,
      capability: "image",
      activeModel,
    }),
  );
  if (fromActive) return fromActive;

  // The key-based lookup is only attempted when the active-model lookup produced nothing usable.
  return asActiveModel(
    await resolveKeyEntry({
      cfg,
      agentDir,
      providerRegistry,
      capability: "image",
      activeModel,
    }),
  );
}
|
||||
|
||||
async function resolveActiveModelEntry(params: {
|
||||
cfg: ClawdbotConfig;
|
||||
agentDir?: string;
|
||||
|
||||
@@ -139,6 +139,7 @@ export const dispatchTelegramMessage = async ({
|
||||
imagePath: ctxPayload.MediaPath,
|
||||
cfg,
|
||||
agentDir,
|
||||
agentId: route.agentId,
|
||||
});
|
||||
if (description) {
|
||||
// Format the description with sticker context
|
||||
|
||||
@@ -7,6 +7,9 @@ const middlewareUseSpy = vi.fn();
|
||||
const onSpy = vi.fn();
|
||||
const stopSpy = vi.fn();
|
||||
const sendChatActionSpy = vi.fn();
|
||||
const cacheStickerSpy = vi.fn();
|
||||
const getCachedStickerSpy = vi.fn();
|
||||
const describeStickerImageSpy = vi.fn();
|
||||
|
||||
type ApiStub = {
|
||||
config: { use: (arg: unknown) => void };
|
||||
@@ -79,6 +82,12 @@ vi.mock("../config/sessions.js", async (importOriginal) => {
|
||||
};
|
||||
});
|
||||
|
||||
vi.mock("./sticker-cache.js", () => ({
|
||||
cacheSticker: (...args: unknown[]) => cacheStickerSpy(...args),
|
||||
getCachedSticker: (...args: unknown[]) => getCachedStickerSpy(...args),
|
||||
describeStickerImage: (...args: unknown[]) => describeStickerImageSpy(...args),
|
||||
}));
|
||||
|
||||
vi.mock("./pairing-store.js", () => ({
|
||||
readTelegramAllowFromStore: vi.fn(async () => [] as string[]),
|
||||
upsertTelegramPairingRequest: vi.fn(async () => ({
|
||||
@@ -408,6 +417,12 @@ describe("telegram media groups", () => {
|
||||
describe("telegram stickers", () => {
|
||||
const STICKER_TEST_TIMEOUT_MS = process.platform === "win32" ? 30_000 : 20_000;
|
||||
|
||||
beforeEach(() => {
|
||||
cacheStickerSpy.mockReset();
|
||||
getCachedStickerSpy.mockReset();
|
||||
describeStickerImageSpy.mockReset();
|
||||
});
|
||||
|
||||
it(
|
||||
"downloads static sticker (WEBP) and includes sticker metadata",
|
||||
async () => {
|
||||
@@ -481,6 +496,88 @@ describe("telegram stickers", () => {
|
||||
STICKER_TEST_TIMEOUT_MS,
|
||||
);
|
||||
|
||||
// Verifies that a cache hit with stale metadata rewrites the cache entry with the
// incoming sticker's file_id/emoji/set_name while still surfacing the cached description.
it(
  "refreshes cached sticker metadata on cache hit",
  async () => {
    const { createTelegramBot } = await import("./bot.js");
    const replyModule = await import("../auto-reply/reply.js");
    const replySpy = replyModule.__replySpy as unknown as ReturnType<typeof vi.fn>;

    onSpy.mockReset();
    replySpy.mockReset();
    sendChatActionSpy.mockReset();

    // Cached entry shares the incoming file_unique_id but carries an older fileId/emoji/setName.
    getCachedStickerSpy.mockReturnValue({
      fileId: "old_file_id",
      fileUniqueId: "sticker_unique_456",
      emoji: "😴",
      setName: "OldSet",
      description: "Cached description",
      cachedAt: "2026-01-20T10:00:00.000Z",
    });

    const runtimeError = vi.fn();
    createTelegramBot({
      token: "tok",
      runtime: {
        log: vi.fn(),
        error: runtimeError,
        exit: () => {
          throw new Error("exit");
        },
      },
    });
    const handler = onSpy.mock.calls.find((call) => call[0] === "message")?.[1] as (
      ctx: Record<string, unknown>,
    ) => Promise<void>;
    expect(handler).toBeDefined();

    const fetchSpy = vi.spyOn(globalThis, "fetch" as never).mockResolvedValueOnce({
      ok: true,
      status: 200,
      statusText: "OK",
      headers: { get: () => "image/webp" },
      arrayBuffer: async () => new Uint8Array([0x52, 0x49, 0x46, 0x46]).buffer,
    } as Response);

    try {
      await handler({
        message: {
          message_id: 103,
          chat: { id: 1234, type: "private" },
          sticker: {
            file_id: "new_file_id",
            file_unique_id: "sticker_unique_456",
            type: "regular",
            width: 512,
            height: 512,
            is_animated: false,
            is_video: false,
            emoji: "🔥",
            set_name: "NewSet",
          },
          date: 1736380800,
        },
        me: { username: "clawdbot_bot" },
        getFile: async () => ({ file_path: "stickers/sticker.webp" }),
      });

      expect(runtimeError).not.toHaveBeenCalled();
      expect(cacheStickerSpy).toHaveBeenCalledWith(
        expect.objectContaining({
          fileId: "new_file_id",
          emoji: "🔥",
          setName: "NewSet",
        }),
      );
      // Fail with a clear message instead of a TypeError when no reply was dispatched.
      expect(replySpy).toHaveBeenCalled();
      const payload = replySpy.mock.calls[0][0];
      expect(payload.Sticker?.fileId).toBe("new_file_id");
      expect(payload.Sticker?.cachedDescription).toBe("Cached description");
    } finally {
      // Restore global fetch even when an assertion above throws, so the stub
      // cannot leak into later tests.
      fetchSpy.mockRestore();
    }
  },
  STICKER_TEST_TIMEOUT_MS,
);
|
||||
|
||||
it(
|
||||
"skips animated stickers (TGS format)",
|
||||
async () => {
|
||||
|
||||
@@ -22,7 +22,7 @@ import { buildInlineKeyboard } from "../send.js";
|
||||
import { resolveTelegramVoiceSend } from "../voice.js";
|
||||
import { buildTelegramThreadParams, resolveTelegramReplyId } from "./helpers.js";
|
||||
import type { StickerMetadata, TelegramContext } from "./types.js";
|
||||
import { getCachedSticker } from "../sticker-cache.js";
|
||||
import { cacheSticker, getCachedSticker } from "../sticker-cache.js";
|
||||
|
||||
const PARSE_ERR_RE = /can't parse entities|parse entities|find end of the entity/i;
|
||||
const VOICE_FORBIDDEN_RE = /VOICE_MESSAGES_FORBIDDEN/;
|
||||
@@ -303,14 +303,26 @@ export async function resolveMedia(
|
||||
const cached = sticker.file_unique_id ? getCachedSticker(sticker.file_unique_id) : null;
|
||||
if (cached) {
|
||||
logVerbose(`telegram: sticker cache hit for ${sticker.file_unique_id}`);
|
||||
const fileId = sticker.file_id ?? cached.fileId;
|
||||
const emoji = sticker.emoji ?? cached.emoji;
|
||||
const setName = sticker.set_name ?? cached.setName;
|
||||
if (fileId !== cached.fileId || emoji !== cached.emoji || setName !== cached.setName) {
|
||||
// Refresh cached sticker metadata on hits so sends/searches use latest file_id.
|
||||
cacheSticker({
|
||||
...cached,
|
||||
fileId,
|
||||
emoji,
|
||||
setName,
|
||||
});
|
||||
}
|
||||
return {
|
||||
path: saved.path,
|
||||
contentType: saved.contentType,
|
||||
placeholder: "<media:sticker>",
|
||||
stickerMetadata: {
|
||||
emoji: cached.emoji,
|
||||
setName: cached.setName,
|
||||
fileId: cached.fileId,
|
||||
emoji,
|
||||
setName,
|
||||
fileId,
|
||||
fileUniqueId: sticker.file_unique_id,
|
||||
cachedDescription: cached.description,
|
||||
},
|
||||
@@ -330,7 +342,7 @@ export async function resolveMedia(
|
||||
},
|
||||
};
|
||||
} catch (err) {
|
||||
logVerbose(`telegram: failed to process sticker: ${err}`);
|
||||
logVerbose(`telegram: failed to process sticker: ${String(err)}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,13 @@ import type { ClawdbotConfig } from "../config/config.js";
|
||||
import { STATE_DIR_CLAWDBOT } from "../config/paths.js";
|
||||
import { loadJsonFile, saveJsonFile } from "../infra/json-file.js";
|
||||
import { logVerbose } from "../globals.js";
|
||||
import { resolveApiKeyForProvider } from "../agents/model-auth.js";
|
||||
import {
|
||||
findModelInCatalog,
|
||||
loadModelCatalog,
|
||||
modelSupportsVision,
|
||||
} from "../agents/model-catalog.js";
|
||||
import { resolveDefaultModelForAgent } from "../agents/model-selection.js";
|
||||
import { resolveAutoImageModel } from "../media-understanding/runner.js";
|
||||
|
||||
const CACHE_FILE = path.join(STATE_DIR_CLAWDBOT, "telegram", "sticker-cache.json");
|
||||
const CACHE_VERSION = 1;
|
||||
@@ -135,18 +141,11 @@ export function getCacheStats(): { count: number; oldestAt?: string; newestAt?:
|
||||
const STICKER_DESCRIPTION_PROMPT =
|
||||
"Describe this sticker image in 1-2 sentences. Focus on what the sticker depicts (character, object, action, emotion). Be concise and objective.";
|
||||
|
||||
const VISION_PROVIDERS = ["anthropic", "openai", "google", "minimax"] as const;
|
||||
const DEFAULT_VISION_MODELS: Record<string, string> = {
|
||||
anthropic: "claude-sonnet-4-20250514",
|
||||
openai: "gpt-4o-mini",
|
||||
google: "gemini-2.0-flash",
|
||||
minimax: "MiniMax-VL-01",
|
||||
};
|
||||
|
||||
export interface DescribeStickerParams {
|
||||
imagePath: string;
|
||||
cfg: ClawdbotConfig;
|
||||
agentDir?: string;
|
||||
agentId?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -155,26 +154,35 @@ export interface DescribeStickerParams {
|
||||
* Returns null if no vision provider is available.
|
||||
*/
|
||||
export async function describeStickerImage(params: DescribeStickerParams): Promise<string | null> {
|
||||
const { imagePath, cfg, agentDir } = params;
|
||||
const { imagePath, cfg, agentDir, agentId } = params;
|
||||
|
||||
// Find a vision provider with available API key
|
||||
let provider: string | null = null;
|
||||
for (const p of VISION_PROVIDERS) {
|
||||
try {
|
||||
await resolveApiKeyForProvider({ provider: p, cfg, agentDir });
|
||||
provider = p;
|
||||
break;
|
||||
} catch {
|
||||
// No key for this provider, try next
|
||||
const defaultModel = resolveDefaultModelForAgent({ cfg, agentId });
|
||||
let activeModel = undefined as { provider: string; model: string } | undefined;
|
||||
try {
|
||||
const catalog = await loadModelCatalog({ config: cfg });
|
||||
const entry = findModelInCatalog(catalog, defaultModel.provider, defaultModel.model);
|
||||
if (modelSupportsVision(entry)) {
|
||||
activeModel = { provider: defaultModel.provider, model: defaultModel.model };
|
||||
}
|
||||
} catch {
|
||||
// Ignore catalog failures; fall back to auto selection.
|
||||
}
|
||||
|
||||
if (!provider) {
|
||||
const resolved = await resolveAutoImageModel({
|
||||
cfg,
|
||||
agentDir,
|
||||
activeModel,
|
||||
});
|
||||
if (!resolved) {
|
||||
logVerbose("telegram: no vision provider available for sticker description");
|
||||
return null;
|
||||
}
|
||||
|
||||
const model = DEFAULT_VISION_MODELS[provider];
|
||||
const { provider, model } = resolved;
|
||||
if (!model) {
|
||||
logVerbose(`telegram: no vision model available for ${provider}`);
|
||||
return null;
|
||||
}
|
||||
logVerbose(`telegram: describing sticker with ${provider}/${model}`);
|
||||
|
||||
try {
|
||||
@@ -195,7 +203,7 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi
|
||||
});
|
||||
return result.text;
|
||||
} catch (err) {
|
||||
logVerbose(`telegram: failed to describe sticker: ${err}`);
|
||||
logVerbose(`telegram: failed to describe sticker: ${String(err)}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user