fix(telegram): send sticker pixels to vision models

2026-01-27 13:33:04 +05:30
parent 2ad550abe8
commit cc80495baa
3 changed files with 132 additions and 29 deletions
--- a/src/telegram/bot-message-context.ts
+++ b/src/telegram/bot-message-context.ts
@@ -1,6 +1,12 @@
 import type { Bot } from "grammy";

 import { resolveAckReaction } from "../agents/identity.js";
+import {
+  findModelInCatalog,
+  loadModelCatalog,
+  modelSupportsVision,
+} from "../agents/model-catalog.js";
+import { resolveDefaultModelForAgent } from "../agents/model-selection.js";
 import { hasControlCommand } from "../auto-reply/command-detection.js";
 import { normalizeCommandBody } from "../auto-reply/commands-registry.js";
 import { formatInboundEnvelope, resolveEnvelopeFormatOptions } from "../auto-reply/envelope.js";
@@ -104,6 +110,24 @@ type BuildTelegramMessageContextParams = {
  resolveTelegramGroupConfig: ResolveTelegramGroupConfig;
 };

+/** Whether the agent's default model is vision-capable per the catalog; any failure yields false. */
+async function resolveStickerVisionSupport(params: {
+  cfg: ClawdbotConfig;
+  agentId?: string;
+}): Promise<boolean> {
+  try {
+    const { cfg, agentId } = params;
+    const models = await loadModelCatalog({ config: cfg });
+    const { provider, model } = resolveDefaultModelForAgent({ cfg, agentId });
+    const match = findModelInCatalog(models, provider, model);
+    if (!match) return false;
+    // Entries without `input` metadata are assumed capable; otherwise defer to the catalog check.
+    return !match.input || modelSupportsVision(match);
+  } catch {
+    return false;
+  }
+}
+
 export const buildTelegramMessageContext = async ({
  primaryCtx,
  allMedia,
@@ -316,7 +340,10 @@ export const buildTelegramMessageContext = async ({

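-  // Check if sticker has a cached description - if so, use it instead of sending the image
+  // Use a cached sticker description instead of the image only when the model lacks vision support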
  const cachedStickerDescription = allMedia[0]?.stickerMetadata?.cachedDescription;
-  const stickerCacheHit = Boolean(cachedStickerDescription);
+  const stickerSupportsVision = msg.sticker
+    ? await resolveStickerVisionSupport({ cfg, agentId: route.agentId })
+    : false;
+  const stickerCacheHit = Boolean(cachedStickerDescription) && !stickerSupportsVision;
  if (stickerCacheHit) {
    // Format cached description with sticker context
    const emoji = allMedia[0]?.stickerMetadata?.emoji;
--- a/src/telegram/bot-message-dispatch.ts
+++ b/src/telegram/bot-message-dispatch.ts
@@ -1,5 +1,11 @@
 // @ts-nocheck
 import { EmbeddedBlockChunker } from "../agents/pi-embedded-block-chunker.js";
+import {
+  findModelInCatalog,
+  loadModelCatalog,
+  modelSupportsVision,
+} from "../agents/model-catalog.js";
+import { resolveDefaultModelForAgent } from "../agents/model-selection.js";
 import { resolveChunkMode } from "../auto-reply/chunk.js";
 import { clearHistoryEntriesIfEnabled } from "../auto-reply/reply/history.js";
 import { dispatchReplyWithBufferedBlockDispatcher } from "../auto-reply/reply/provider-dispatcher.js";
@@ -15,6 +21,18 @@ import { createTelegramDraftStream } from "./draft-stream.js";
 import { cacheSticker, describeStickerImage } from "./sticker-cache.js";
 import { resolveAgentDir } from "../agents/agent-scope.js";

+// Whether the agent's default model is vision-capable per the catalog; any failure yields false.
+async function resolveStickerVisionSupport(cfg, agentId) {
+  try {
+    const models = await loadModelCatalog({ config: cfg });
+    const { provider, model } = resolveDefaultModelForAgent({ cfg, agentId });
+    const match = findModelInCatalog(models, provider, model);
+    return match ? !match.input || modelSupportsVision(match) : false;
+  } catch {
+    return false;
+  }
+}
+
 export const dispatchTelegramMessage = async ({
  context,
  bot,
@@ -133,14 +151,18 @@ export const dispatchTelegramMessage = async ({
  // Handle uncached stickers: get a dedicated vision description before dispatch
  // This ensures we cache a raw description rather than a conversational response
  const sticker = ctxPayload.Sticker;
-  if (sticker?.fileUniqueId && !sticker.cachedDescription && ctxPayload.MediaPath) {
+  if (sticker?.fileUniqueId && ctxPayload.MediaPath) {
    const agentDir = resolveAgentDir(cfg, route.agentId);
-    const description = await describeStickerImage({
-      imagePath: ctxPayload.MediaPath,
-      cfg,
-      agentDir,
-      agentId: route.agentId,
-    });
+    const stickerSupportsVision = await resolveStickerVisionSupport(cfg, route.agentId);
+    let description = sticker.cachedDescription ?? null;
+    if (!description) {
+      description = await describeStickerImage({
+        imagePath: ctxPayload.MediaPath,
+        cfg,
+        agentDir,
+        agentId: route.agentId,
+      });
+    }
    if (description) {
      // Format the description with sticker context
      const stickerContext = [sticker.emoji, sticker.setName ? `from "${sticker.setName}"` : null]
@@ -148,17 +170,19 @@ export const dispatchTelegramMessage = async ({
        .join(" ");
      const formattedDesc = `[Sticker${stickerContext ? ` ${stickerContext}` : ""}] ${description}`;

-      // Update context to use description instead of image
      sticker.cachedDescription = description;
-      ctxPayload.Body = formattedDesc;
-      ctxPayload.BodyForAgent = formattedDesc;
-      // Clear media paths so native vision doesn't process the image again
-      ctxPayload.MediaPath = undefined;
-      ctxPayload.MediaType = undefined;
-      ctxPayload.MediaUrl = undefined;
-      ctxPayload.MediaPaths = undefined;
-      ctxPayload.MediaUrls = undefined;
-      ctxPayload.MediaTypes = undefined;
+      if (!stickerSupportsVision) {
+        // Update context to use description instead of image
+        ctxPayload.Body = formattedDesc;
+        ctxPayload.BodyForAgent = formattedDesc;
+        // Clear media paths so native vision doesn't process the image again
+        ctxPayload.MediaPath = undefined;
+        ctxPayload.MediaType = undefined;
+        ctxPayload.MediaUrl = undefined;
+        ctxPayload.MediaPaths = undefined;
+        ctxPayload.MediaUrls = undefined;
+        ctxPayload.MediaTypes = undefined;
+      }

      // Cache the description for future encounters
      cacheSticker({
--- a/src/telegram/sticker-cache.ts
+++ b/src/telegram/sticker-cache.ts
@@ -4,11 +4,13 @@ import type { ClawdbotConfig } from "../config/config.js";
 import { STATE_DIR_CLAWDBOT } from "../config/paths.js";
 import { loadJsonFile, saveJsonFile } from "../infra/json-file.js";
 import { logVerbose } from "../globals.js";
+import type { ModelCatalogEntry } from "../agents/model-catalog.js";
 import {
  findModelInCatalog,
  loadModelCatalog,
  modelSupportsVision,
 } from "../agents/model-catalog.js";
+import { resolveApiKeyForProvider } from "../agents/model-auth.js";
 import { resolveDefaultModelForAgent } from "../agents/model-selection.js";
 import { resolveAutoImageModel } from "../media-understanding/runner.js";

@@ -140,6 +142,7 @@ export function getCacheStats(): { count: number; oldestAt?: string; newestAt?:

 const STICKER_DESCRIPTION_PROMPT =
  "Describe this sticker image in 1-2 sentences. Focus on what the sticker depicts (character, object, action, emotion). Be concise and objective.";
+const VISION_PROVIDERS = ["openai", "anthropic", "google", "minimax"] as const; // tried in this order

 export interface DescribeStickerParams {
  imagePath: string;
@@ -158,31 +161,80 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi

  const defaultModel = resolveDefaultModelForAgent({ cfg, agentId });
  let activeModel = undefined as { provider: string; model: string } | undefined;
+  let catalog: ModelCatalogEntry[] = [];
  try {
-    const catalog = await loadModelCatalog({ config: cfg });
+    catalog = await loadModelCatalog({ config: cfg });
    const entry = findModelInCatalog(catalog, defaultModel.provider, defaultModel.model);
-    if (modelSupportsVision(entry)) {
+    const supportsVision = entry?.input ? modelSupportsVision(entry) : Boolean(entry);
+    if (supportsVision) {
      activeModel = { provider: defaultModel.provider, model: defaultModel.model };
    }
  } catch {
    // Ignore catalog failures; fall back to auto selection.
  }

-  const resolved = await resolveAutoImageModel({
-    cfg,
-    agentDir,
-    activeModel,
-  });
+  // Resolves true when resolveApiKeyForProvider succeeds for `provider`, false when it fails.
+  const hasProviderKey = async (provider: string): Promise<boolean> => {
+    const lookup = async () => resolveApiKeyForProvider({ provider, cfg, agentDir });
+    return lookup().then(
+      () => true,
+      () => false,
+    );
+  };
+
+  // Chooses the catalog entry used to describe stickers with `provider`: only entries that
+  // are vision-capable (or carry no `input` metadata) qualify, the provider's preferred id
+  // wins when present, and otherwise the first qualifying entry is returned.
+  const selectCatalogModel = (provider: string) => {
+    const wanted = provider.toLowerCase();
+    const candidates = catalog.filter(
+      (item) =>
+        item.provider.toLowerCase() === wanted && (!item.input || modelSupportsVision(item)),
+    );
+    if (candidates.length === 0) return undefined;
+    // Every provider other than the three named here falls back to the MiniMax id.
+    let preferredId = "MiniMax-VL-01";
+    if (provider === "openai") preferredId = "gpt-5-mini";
+    else if (provider === "anthropic") preferredId = "claude-opus-4-5";
+    else if (provider === "google") preferredId = "gemini-3-flash-preview";
+    const favourite = candidates.find((item) => item.id === preferredId);
+    return favourite ?? candidates[0];
+  };
+
+  let resolved = null as { provider: string; model?: string } | null;
+  if (
+    activeModel &&
+    VISION_PROVIDERS.includes(activeModel.provider as (typeof VISION_PROVIDERS)[number]) &&
+    (await hasProviderKey(activeModel.provider))
+  ) {
+    resolved = activeModel;
+  }
+
  if (!resolved) {
+    for (const provider of VISION_PROVIDERS) {
+      if (!(await hasProviderKey(provider))) continue;
+      const entry = selectCatalogModel(provider);
+      if (entry) {
+        resolved = { provider, model: entry.id };
+        break;
+      }
+    }
+  }
+
+  if (!resolved) {
+    resolved = await resolveAutoImageModel({
+      cfg,
+      agentDir,
+      activeModel,
+    });
+  }
+
+  if (!resolved?.model) {
    logVerbose("telegram: no vision provider available for sticker description");
    return null;
  }

  const { provider, model } = resolved;
-  if (!model) {
-    logVerbose(`telegram: no vision model available for ${provider}`);
-    return null;
-  }
  logVerbose(`telegram: describing sticker with ${provider}/${model}`);

  try {