From cc80495baadcd8ced64b49d7728074b0451da365 Mon Sep 17 00:00:00 2001 From: Ayaan Zaidi Date: Tue, 27 Jan 2026 13:33:04 +0530 Subject: [PATCH] fix(telegram): send sticker pixels to vision models --- src/telegram/bot-message-context.ts | 29 ++++++++++- src/telegram/bot-message-dispatch.ts | 58 +++++++++++++++------- src/telegram/sticker-cache.ts | 74 +++++++++++++++++++++++----- 3 files changed, 132 insertions(+), 29 deletions(-) diff --git a/src/telegram/bot-message-context.ts b/src/telegram/bot-message-context.ts index 71ac8a011..3f2c4af57 100644 --- a/src/telegram/bot-message-context.ts +++ b/src/telegram/bot-message-context.ts @@ -1,6 +1,12 @@ import type { Bot } from "grammy"; import { resolveAckReaction } from "../agents/identity.js"; +import { + findModelInCatalog, + loadModelCatalog, + modelSupportsVision, +} from "../agents/model-catalog.js"; +import { resolveDefaultModelForAgent } from "../agents/model-selection.js"; import { hasControlCommand } from "../auto-reply/command-detection.js"; import { normalizeCommandBody } from "../auto-reply/commands-registry.js"; import { formatInboundEnvelope, resolveEnvelopeFormatOptions } from "../auto-reply/envelope.js"; @@ -104,6 +110,24 @@ type BuildTelegramMessageContextParams = { resolveTelegramGroupConfig: ResolveTelegramGroupConfig; }; +async function resolveStickerVisionSupport(params: { + cfg: ClawdbotConfig; + agentId?: string; +}): Promise { + try { + const catalog = await loadModelCatalog({ config: params.cfg }); + const defaultModel = resolveDefaultModelForAgent({ + cfg: params.cfg, + agentId: params.agentId, + }); + const entry = findModelInCatalog(catalog, defaultModel.provider, defaultModel.model); + if (!entry) return false; + return entry.input ? modelSupportsVision(entry) : true; + } catch { + return false; + } +} + export const buildTelegramMessageContext = async ({ primaryCtx, allMedia, @@ -316,7 +340,10 @@ export const buildTelegramMessageContext = async ({ // Check if sticker has a cached description - if so, use it instead of sending the image const cachedStickerDescription = allMedia[0]?.stickerMetadata?.cachedDescription; - const stickerCacheHit = Boolean(cachedStickerDescription); + const stickerSupportsVision = msg.sticker + ? await resolveStickerVisionSupport({ cfg, agentId: route.agentId }) + : false; + const stickerCacheHit = Boolean(cachedStickerDescription) && !stickerSupportsVision; if (stickerCacheHit) { // Format cached description with sticker context const emoji = allMedia[0]?.stickerMetadata?.emoji; diff --git a/src/telegram/bot-message-dispatch.ts b/src/telegram/bot-message-dispatch.ts index a3e9c3faa..7c5929e5a 100644 --- a/src/telegram/bot-message-dispatch.ts +++ b/src/telegram/bot-message-dispatch.ts @@ -1,5 +1,11 @@ // @ts-nocheck import { EmbeddedBlockChunker } from "../agents/pi-embedded-block-chunker.js"; +import { + findModelInCatalog, + loadModelCatalog, + modelSupportsVision, +} from "../agents/model-catalog.js"; +import { resolveDefaultModelForAgent } from "../agents/model-selection.js"; import { resolveChunkMode } from "../auto-reply/chunk.js"; import { clearHistoryEntriesIfEnabled } from "../auto-reply/reply/history.js"; import { dispatchReplyWithBufferedBlockDispatcher } from "../auto-reply/reply/provider-dispatcher.js"; @@ -15,6 +21,18 @@ import { createTelegramDraftStream } from "./draft-stream.js"; import { cacheSticker, describeStickerImage } from "./sticker-cache.js"; import { resolveAgentDir } from "../agents/agent-scope.js"; +async function resolveStickerVisionSupport(cfg, agentId) { + try { + const catalog = await loadModelCatalog({ config: cfg }); + const defaultModel = resolveDefaultModelForAgent({ cfg, agentId }); + const entry = findModelInCatalog(catalog, defaultModel.provider, defaultModel.model); + if (!entry) return false; + return entry.input ? modelSupportsVision(entry) : true; + } catch { + return false; + } +} + export const dispatchTelegramMessage = async ({ context, bot, @@ -133,14 +151,18 @@ export const dispatchTelegramMessage = async ({ // Handle uncached stickers: get a dedicated vision description before dispatch // This ensures we cache a raw description rather than a conversational response const sticker = ctxPayload.Sticker; - if (sticker?.fileUniqueId && !sticker.cachedDescription && ctxPayload.MediaPath) { + if (sticker?.fileUniqueId && ctxPayload.MediaPath) { const agentDir = resolveAgentDir(cfg, route.agentId); - const description = await describeStickerImage({ - imagePath: ctxPayload.MediaPath, - cfg, - agentDir, - agentId: route.agentId, - }); + const stickerSupportsVision = await resolveStickerVisionSupport(cfg, route.agentId); + let description = sticker.cachedDescription ?? null; + if (!description) { + description = await describeStickerImage({ + imagePath: ctxPayload.MediaPath, + cfg, + agentDir, + agentId: route.agentId, + }); + } if (description) { // Format the description with sticker context const stickerContext = [sticker.emoji, sticker.setName ? `from "${sticker.setName}"` : null] @@ -148,17 +170,19 @@ export const dispatchTelegramMessage = async ({ .join(" "); const formattedDesc = `[Sticker${stickerContext ? ` ${stickerContext}` : ""}] ${description}`; - // Update context to use description instead of image sticker.cachedDescription = description; - ctxPayload.Body = formattedDesc; - ctxPayload.BodyForAgent = formattedDesc; - // Clear media paths so native vision doesn't process the image again - ctxPayload.MediaPath = undefined; - ctxPayload.MediaType = undefined; - ctxPayload.MediaUrl = undefined; - ctxPayload.MediaPaths = undefined; - ctxPayload.MediaUrls = undefined; - ctxPayload.MediaTypes = undefined; + if (!stickerSupportsVision) { + // Update context to use description instead of image + ctxPayload.Body = formattedDesc; + ctxPayload.BodyForAgent = formattedDesc; + // Clear media paths so native vision doesn't process the image again + ctxPayload.MediaPath = undefined; + ctxPayload.MediaType = undefined; + ctxPayload.MediaUrl = undefined; + ctxPayload.MediaPaths = undefined; + ctxPayload.MediaUrls = undefined; + ctxPayload.MediaTypes = undefined; + } // Cache the description for future encounters cacheSticker({ diff --git a/src/telegram/sticker-cache.ts b/src/telegram/sticker-cache.ts index 38f421851..5c517ac12 100644 --- a/src/telegram/sticker-cache.ts +++ b/src/telegram/sticker-cache.ts @@ -4,11 +4,13 @@ import type { ClawdbotConfig } from "../config/config.js"; import { STATE_DIR_CLAWDBOT } from "../config/paths.js"; import { loadJsonFile, saveJsonFile } from "../infra/json-file.js"; import { logVerbose } from "../globals.js"; +import type { ModelCatalogEntry } from "../agents/model-catalog.js"; import { findModelInCatalog, loadModelCatalog, modelSupportsVision, } from "../agents/model-catalog.js"; +import { resolveApiKeyForProvider } from "../agents/model-auth.js"; import { resolveDefaultModelForAgent } from "../agents/model-selection.js"; import { resolveAutoImageModel } from "../media-understanding/runner.js"; @@ -140,6 +142,7 @@ export function getCacheStats(): { count: number; oldestAt?: string; newestAt?: const STICKER_DESCRIPTION_PROMPT = "Describe this sticker image in 1-2 sentences. Focus on what the sticker depicts (character, object, action, emotion). Be concise and objective."; +const VISION_PROVIDERS = ["openai", "anthropic", "google", "minimax"] as const; export interface DescribeStickerParams { imagePath: string; @@ -158,31 +161,80 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi const defaultModel = resolveDefaultModelForAgent({ cfg, agentId }); let activeModel = undefined as { provider: string; model: string } | undefined; + let catalog: ModelCatalogEntry[] = []; try { - const catalog = await loadModelCatalog({ config: cfg }); + catalog = await loadModelCatalog({ config: cfg }); const entry = findModelInCatalog(catalog, defaultModel.provider, defaultModel.model); - if (modelSupportsVision(entry)) { + const supportsVision = entry?.input ? modelSupportsVision(entry) : Boolean(entry); + if (supportsVision) { activeModel = { provider: defaultModel.provider, model: defaultModel.model }; } } catch { // Ignore catalog failures; fall back to auto selection. } - const resolved = await resolveAutoImageModel({ - cfg, - agentDir, - activeModel, - }); + const hasProviderKey = async (provider: string) => { + try { + await resolveApiKeyForProvider({ provider, cfg, agentDir }); + return true; + } catch { + return false; + } + }; + + const selectCatalogModel = (provider: string) => { + const entries = catalog.filter( + (entry) => + entry.provider.toLowerCase() === provider.toLowerCase() && + (entry.input ? modelSupportsVision(entry) : true), + ); + if (entries.length === 0) return undefined; + const defaultId = + provider === "openai" + ? "gpt-5-mini" + : provider === "anthropic" + ? "claude-opus-4-5" + : provider === "google" + ? "gemini-3-flash-preview" + : "MiniMax-VL-01"; + const preferred = entries.find((entry) => entry.id === defaultId); + return preferred ?? entries[0]; + }; + + let resolved = null as { provider: string; model?: string } | null; + if ( + activeModel && + VISION_PROVIDERS.includes(activeModel.provider as (typeof VISION_PROVIDERS)[number]) && + (await hasProviderKey(activeModel.provider)) + ) { + resolved = activeModel; + } + if (!resolved) { + for (const provider of VISION_PROVIDERS) { + if (!(await hasProviderKey(provider))) continue; + const entry = selectCatalogModel(provider); + if (entry) { + resolved = { provider, model: entry.id }; + break; + } + } + } + + if (!resolved) { + resolved = await resolveAutoImageModel({ + cfg, + agentDir, + activeModel, + }); + } + + if (!resolved?.model) { logVerbose("telegram: no vision provider available for sticker description"); return null; } const { provider, model } = resolved; - if (!model) { - logVerbose(`telegram: no vision model available for ${provider}`); - return null; - } logVerbose(`telegram: describing sticker with ${provider}/${model}`); try {