fix(telegram): send sticker pixels to vision models

This commit is contained in:
Ayaan Zaidi
2026-01-27 13:33:04 +05:30
committed by Ayaan Zaidi
parent 2ad550abe8
commit cc80495baa
3 changed files with 132 additions and 29 deletions

View File

@@ -1,6 +1,12 @@
import type { Bot } from "grammy"; import type { Bot } from "grammy";
import { resolveAckReaction } from "../agents/identity.js"; import { resolveAckReaction } from "../agents/identity.js";
import {
findModelInCatalog,
loadModelCatalog,
modelSupportsVision,
} from "../agents/model-catalog.js";
import { resolveDefaultModelForAgent } from "../agents/model-selection.js";
import { hasControlCommand } from "../auto-reply/command-detection.js"; import { hasControlCommand } from "../auto-reply/command-detection.js";
import { normalizeCommandBody } from "../auto-reply/commands-registry.js"; import { normalizeCommandBody } from "../auto-reply/commands-registry.js";
import { formatInboundEnvelope, resolveEnvelopeFormatOptions } from "../auto-reply/envelope.js"; import { formatInboundEnvelope, resolveEnvelopeFormatOptions } from "../auto-reply/envelope.js";
@@ -104,6 +110,24 @@ type BuildTelegramMessageContextParams = {
resolveTelegramGroupConfig: ResolveTelegramGroupConfig; resolveTelegramGroupConfig: ResolveTelegramGroupConfig;
}; };
/**
 * Decide whether the agent's default model can be given sticker images directly.
 *
 * Loads the model catalog, resolves the default model for `params.agentId`,
 * and looks that model up in the catalog.
 *
 * @param params.cfg - Configuration passed to the catalog loader and the
 *   default-model resolver.
 * @param params.agentId - Optional agent whose default model is inspected.
 * @returns `false` when the model has no catalog entry or when any step throws;
 *   `true` when the entry has no (truthy) `input` field; otherwise the result
 *   of `modelSupportsVision` for that entry.
 */
async function resolveStickerVisionSupport(params: {
  cfg: ClawdbotConfig;
  agentId?: string;
}): Promise<boolean> {
  try {
    const { cfg, agentId } = params;
    // The catalog is loaded before the default model is resolved.
    const models = await loadModelCatalog({ config: cfg });
    const { provider, model } = resolveDefaultModelForAgent({ cfg, agentId });
    const match = findModelInCatalog(models, provider, model);
    if (match && match.input) {
      // The entry declares its inputs, so the catalog helper decides.
      return modelSupportsVision(match);
    }
    // A missing entry yields false; an entry without `input` yields true.
    return Boolean(match);
  } catch {
    // Best effort: any failure is reported as "no vision support".
    return false;
  }
}
export const buildTelegramMessageContext = async ({ export const buildTelegramMessageContext = async ({
primaryCtx, primaryCtx,
allMedia, allMedia,
@@ -316,7 +340,10 @@ export const buildTelegramMessageContext = async ({
// Check if sticker has a cached description - if so, use it instead of sending the image // Check if sticker has a cached description - if so, use it instead of sending the image
const cachedStickerDescription = allMedia[0]?.stickerMetadata?.cachedDescription; const cachedStickerDescription = allMedia[0]?.stickerMetadata?.cachedDescription;
const stickerCacheHit = Boolean(cachedStickerDescription); const stickerSupportsVision = msg.sticker
? await resolveStickerVisionSupport({ cfg, agentId: route.agentId })
: false;
const stickerCacheHit = Boolean(cachedStickerDescription) && !stickerSupportsVision;
if (stickerCacheHit) { if (stickerCacheHit) {
// Format cached description with sticker context // Format cached description with sticker context
const emoji = allMedia[0]?.stickerMetadata?.emoji; const emoji = allMedia[0]?.stickerMetadata?.emoji;

View File

@@ -1,5 +1,11 @@
// @ts-nocheck // @ts-nocheck
import { EmbeddedBlockChunker } from "../agents/pi-embedded-block-chunker.js"; import { EmbeddedBlockChunker } from "../agents/pi-embedded-block-chunker.js";
import {
findModelInCatalog,
loadModelCatalog,
modelSupportsVision,
} from "../agents/model-catalog.js";
import { resolveDefaultModelForAgent } from "../agents/model-selection.js";
import { resolveChunkMode } from "../auto-reply/chunk.js"; import { resolveChunkMode } from "../auto-reply/chunk.js";
import { clearHistoryEntriesIfEnabled } from "../auto-reply/reply/history.js"; import { clearHistoryEntriesIfEnabled } from "../auto-reply/reply/history.js";
import { dispatchReplyWithBufferedBlockDispatcher } from "../auto-reply/reply/provider-dispatcher.js"; import { dispatchReplyWithBufferedBlockDispatcher } from "../auto-reply/reply/provider-dispatcher.js";
@@ -15,6 +21,18 @@ import { createTelegramDraftStream } from "./draft-stream.js";
import { cacheSticker, describeStickerImage } from "./sticker-cache.js"; import { cacheSticker, describeStickerImage } from "./sticker-cache.js";
import { resolveAgentDir } from "../agents/agent-scope.js"; import { resolveAgentDir } from "../agents/agent-scope.js";
/**
 * Report whether the agent's default model can be handed sticker images.
 *
 * @param cfg - Configuration forwarded to the catalog loader and the
 *   default-model resolver.
 * @param agentId - Agent whose default model is looked up in the catalog.
 * @returns A promise resolving to `false` if the model is not in the catalog or
 *   any step throws, `true` if the catalog entry has no (truthy) `input` field,
 *   and otherwise whatever `modelSupportsVision` returns for the entry.
 */
async function resolveStickerVisionSupport(cfg, agentId) {
  try {
    // Load the catalog first, then resolve the agent's default model.
    const availableModels = await loadModelCatalog({ config: cfg });
    const chosen = resolveDefaultModelForAgent({ cfg, agentId });
    const catalogEntry = findModelInCatalog(availableModels, chosen.provider, chosen.model);

    if (!catalogEntry) {
      return false;
    }
    if (!catalogEntry.input) {
      // No declared inputs on the entry: treated as vision-capable.
      return true;
    }
    return modelSupportsVision(catalogEntry);
  } catch {
    // Catalog or lookup failures are treated as "no vision support".
    return false;
  }
}
export const dispatchTelegramMessage = async ({ export const dispatchTelegramMessage = async ({
context, context,
bot, bot,
@@ -133,14 +151,18 @@ export const dispatchTelegramMessage = async ({
// Handle uncached stickers: get a dedicated vision description before dispatch // Handle uncached stickers: get a dedicated vision description before dispatch
// This ensures we cache a raw description rather than a conversational response // This ensures we cache a raw description rather than a conversational response
const sticker = ctxPayload.Sticker; const sticker = ctxPayload.Sticker;
if (sticker?.fileUniqueId && !sticker.cachedDescription && ctxPayload.MediaPath) { if (sticker?.fileUniqueId && ctxPayload.MediaPath) {
const agentDir = resolveAgentDir(cfg, route.agentId); const agentDir = resolveAgentDir(cfg, route.agentId);
const description = await describeStickerImage({ const stickerSupportsVision = await resolveStickerVisionSupport(cfg, route.agentId);
imagePath: ctxPayload.MediaPath, let description = sticker.cachedDescription ?? null;
cfg, if (!description) {
agentDir, description = await describeStickerImage({
agentId: route.agentId, imagePath: ctxPayload.MediaPath,
}); cfg,
agentDir,
agentId: route.agentId,
});
}
if (description) { if (description) {
// Format the description with sticker context // Format the description with sticker context
const stickerContext = [sticker.emoji, sticker.setName ? `from "${sticker.setName}"` : null] const stickerContext = [sticker.emoji, sticker.setName ? `from "${sticker.setName}"` : null]
@@ -148,17 +170,19 @@ export const dispatchTelegramMessage = async ({
.join(" "); .join(" ");
const formattedDesc = `[Sticker${stickerContext ? ` ${stickerContext}` : ""}] ${description}`; const formattedDesc = `[Sticker${stickerContext ? ` ${stickerContext}` : ""}] ${description}`;
// Update context to use description instead of image
sticker.cachedDescription = description; sticker.cachedDescription = description;
ctxPayload.Body = formattedDesc; if (!stickerSupportsVision) {
ctxPayload.BodyForAgent = formattedDesc; // Update context to use description instead of image
// Clear media paths so native vision doesn't process the image again ctxPayload.Body = formattedDesc;
ctxPayload.MediaPath = undefined; ctxPayload.BodyForAgent = formattedDesc;
ctxPayload.MediaType = undefined; // Clear media paths so native vision doesn't process the image again
ctxPayload.MediaUrl = undefined; ctxPayload.MediaPath = undefined;
ctxPayload.MediaPaths = undefined; ctxPayload.MediaType = undefined;
ctxPayload.MediaUrls = undefined; ctxPayload.MediaUrl = undefined;
ctxPayload.MediaTypes = undefined; ctxPayload.MediaPaths = undefined;
ctxPayload.MediaUrls = undefined;
ctxPayload.MediaTypes = undefined;
}
// Cache the description for future encounters // Cache the description for future encounters
cacheSticker({ cacheSticker({

View File

@@ -4,11 +4,13 @@ import type { ClawdbotConfig } from "../config/config.js";
import { STATE_DIR_CLAWDBOT } from "../config/paths.js"; import { STATE_DIR_CLAWDBOT } from "../config/paths.js";
import { loadJsonFile, saveJsonFile } from "../infra/json-file.js"; import { loadJsonFile, saveJsonFile } from "../infra/json-file.js";
import { logVerbose } from "../globals.js"; import { logVerbose } from "../globals.js";
import type { ModelCatalogEntry } from "../agents/model-catalog.js";
import { import {
findModelInCatalog, findModelInCatalog,
loadModelCatalog, loadModelCatalog,
modelSupportsVision, modelSupportsVision,
} from "../agents/model-catalog.js"; } from "../agents/model-catalog.js";
import { resolveApiKeyForProvider } from "../agents/model-auth.js";
import { resolveDefaultModelForAgent } from "../agents/model-selection.js"; import { resolveDefaultModelForAgent } from "../agents/model-selection.js";
import { resolveAutoImageModel } from "../media-understanding/runner.js"; import { resolveAutoImageModel } from "../media-understanding/runner.js";
@@ -140,6 +142,7 @@ export function getCacheStats(): { count: number; oldestAt?: string; newestAt?:
const STICKER_DESCRIPTION_PROMPT = const STICKER_DESCRIPTION_PROMPT =
"Describe this sticker image in 1-2 sentences. Focus on what the sticker depicts (character, object, action, emotion). Be concise and objective."; "Describe this sticker image in 1-2 sentences. Focus on what the sticker depicts (character, object, action, emotion). Be concise and objective.";
const VISION_PROVIDERS = ["openai", "anthropic", "google", "minimax"] as const;
export interface DescribeStickerParams { export interface DescribeStickerParams {
imagePath: string; imagePath: string;
@@ -158,31 +161,80 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi
const defaultModel = resolveDefaultModelForAgent({ cfg, agentId }); const defaultModel = resolveDefaultModelForAgent({ cfg, agentId });
let activeModel = undefined as { provider: string; model: string } | undefined; let activeModel = undefined as { provider: string; model: string } | undefined;
let catalog: ModelCatalogEntry[] = [];
try { try {
const catalog = await loadModelCatalog({ config: cfg }); catalog = await loadModelCatalog({ config: cfg });
const entry = findModelInCatalog(catalog, defaultModel.provider, defaultModel.model); const entry = findModelInCatalog(catalog, defaultModel.provider, defaultModel.model);
if (modelSupportsVision(entry)) { const supportsVision = entry?.input ? modelSupportsVision(entry) : Boolean(entry);
if (supportsVision) {
activeModel = { provider: defaultModel.provider, model: defaultModel.model }; activeModel = { provider: defaultModel.provider, model: defaultModel.model };
} }
} catch { } catch {
// Ignore catalog failures; fall back to auto selection. // Ignore catalog failures; fall back to auto selection.
} }
const resolved = await resolveAutoImageModel({ const hasProviderKey = async (provider: string) => {
cfg, try {
agentDir, await resolveApiKeyForProvider({ provider, cfg, agentDir });
activeModel, return true;
}); } catch {
return false;
}
};
const selectCatalogModel = (provider: string) => {
const entries = catalog.filter(
(entry) =>
entry.provider.toLowerCase() === provider.toLowerCase() &&
(entry.input ? modelSupportsVision(entry) : true),
);
if (entries.length === 0) return undefined;
const defaultId =
provider === "openai"
? "gpt-5-mini"
: provider === "anthropic"
? "claude-opus-4-5"
: provider === "google"
? "gemini-3-flash-preview"
: "MiniMax-VL-01";
const preferred = entries.find((entry) => entry.id === defaultId);
return preferred ?? entries[0];
};
let resolved = null as { provider: string; model?: string } | null;
if (
activeModel &&
VISION_PROVIDERS.includes(activeModel.provider as (typeof VISION_PROVIDERS)[number]) &&
(await hasProviderKey(activeModel.provider))
) {
resolved = activeModel;
}
if (!resolved) { if (!resolved) {
for (const provider of VISION_PROVIDERS) {
if (!(await hasProviderKey(provider))) continue;
const entry = selectCatalogModel(provider);
if (entry) {
resolved = { provider, model: entry.id };
break;
}
}
}
if (!resolved) {
resolved = await resolveAutoImageModel({
cfg,
agentDir,
activeModel,
});
}
if (!resolved?.model) {
logVerbose("telegram: no vision provider available for sticker description"); logVerbose("telegram: no vision provider available for sticker description");
return null; return null;
} }
const { provider, model } = resolved; const { provider, model } = resolved;
if (!model) {
logVerbose(`telegram: no vision model available for ${provider}`);
return null;
}
logVerbose(`telegram: describing sticker with ${provider}/${model}`); logVerbose(`telegram: describing sticker with ${provider}/${model}`);
try { try {