fix(telegram): send sticker pixels to vision models
This commit is contained in:
@@ -1,6 +1,12 @@
|
|||||||
import type { Bot } from "grammy";
|
import type { Bot } from "grammy";
|
||||||
|
|
||||||
import { resolveAckReaction } from "../agents/identity.js";
|
import { resolveAckReaction } from "../agents/identity.js";
|
||||||
|
import {
|
||||||
|
findModelInCatalog,
|
||||||
|
loadModelCatalog,
|
||||||
|
modelSupportsVision,
|
||||||
|
} from "../agents/model-catalog.js";
|
||||||
|
import { resolveDefaultModelForAgent } from "../agents/model-selection.js";
|
||||||
import { hasControlCommand } from "../auto-reply/command-detection.js";
|
import { hasControlCommand } from "../auto-reply/command-detection.js";
|
||||||
import { normalizeCommandBody } from "../auto-reply/commands-registry.js";
|
import { normalizeCommandBody } from "../auto-reply/commands-registry.js";
|
||||||
import { formatInboundEnvelope, resolveEnvelopeFormatOptions } from "../auto-reply/envelope.js";
|
import { formatInboundEnvelope, resolveEnvelopeFormatOptions } from "../auto-reply/envelope.js";
|
||||||
@@ -104,6 +110,24 @@ type BuildTelegramMessageContextParams = {
|
|||||||
resolveTelegramGroupConfig: ResolveTelegramGroupConfig;
|
resolveTelegramGroupConfig: ResolveTelegramGroupConfig;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
 * Reports whether the default model resolved for an agent can be handed
 * sticker images directly as vision input.
 *
 * @param params - `cfg` is forwarded to the catalog loader and the
 *   default-model resolver; `agentId` optionally selects the agent whose
 *   default model is checked.
 * @returns `false` when the default model has no catalog entry or when any
 *   lookup throws; `true` when the entry carries no `input` value; otherwise
 *   the result of `modelSupportsVision` for that entry.
 */
async function resolveStickerVisionSupport(params: {
  cfg: ClawdbotConfig;
  agentId?: string;
}): Promise<boolean> {
  try {
    // Keep every lookup (including the `params` reads) inside the try so that
    // any failure degrades to "no vision support" instead of propagating.
    const modelCatalog = await loadModelCatalog({ config: params.cfg });
    const { provider, model } = resolveDefaultModelForAgent({
      cfg: params.cfg,
      agentId: params.agentId,
    });
    const catalogEntry = findModelInCatalog(modelCatalog, provider, model);
    if (!catalogEntry) {
      return false;
    }
    // An entry without `input` is treated as vision-capable.
    if (!catalogEntry.input) {
      return true;
    }
    return modelSupportsVision(catalogEntry);
  } catch {
    return false;
  }
}
|
||||||
|
|
||||||
export const buildTelegramMessageContext = async ({
|
export const buildTelegramMessageContext = async ({
|
||||||
primaryCtx,
|
primaryCtx,
|
||||||
allMedia,
|
allMedia,
|
||||||
@@ -316,7 +340,10 @@ export const buildTelegramMessageContext = async ({
|
|||||||
|
|
||||||
// Check if sticker has a cached description - if so, use it instead of sending the image
|
// Check if sticker has a cached description - if so, use it instead of sending the image
|
||||||
const cachedStickerDescription = allMedia[0]?.stickerMetadata?.cachedDescription;
|
const cachedStickerDescription = allMedia[0]?.stickerMetadata?.cachedDescription;
|
||||||
const stickerCacheHit = Boolean(cachedStickerDescription);
|
const stickerSupportsVision = msg.sticker
|
||||||
|
? await resolveStickerVisionSupport({ cfg, agentId: route.agentId })
|
||||||
|
: false;
|
||||||
|
const stickerCacheHit = Boolean(cachedStickerDescription) && !stickerSupportsVision;
|
||||||
if (stickerCacheHit) {
|
if (stickerCacheHit) {
|
||||||
// Format cached description with sticker context
|
// Format cached description with sticker context
|
||||||
const emoji = allMedia[0]?.stickerMetadata?.emoji;
|
const emoji = allMedia[0]?.stickerMetadata?.emoji;
|
||||||
|
|||||||
@@ -1,5 +1,11 @@
|
|||||||
// @ts-nocheck
|
// @ts-nocheck
|
||||||
import { EmbeddedBlockChunker } from "../agents/pi-embedded-block-chunker.js";
|
import { EmbeddedBlockChunker } from "../agents/pi-embedded-block-chunker.js";
|
||||||
|
import {
|
||||||
|
findModelInCatalog,
|
||||||
|
loadModelCatalog,
|
||||||
|
modelSupportsVision,
|
||||||
|
} from "../agents/model-catalog.js";
|
||||||
|
import { resolveDefaultModelForAgent } from "../agents/model-selection.js";
|
||||||
import { resolveChunkMode } from "../auto-reply/chunk.js";
|
import { resolveChunkMode } from "../auto-reply/chunk.js";
|
||||||
import { clearHistoryEntriesIfEnabled } from "../auto-reply/reply/history.js";
|
import { clearHistoryEntriesIfEnabled } from "../auto-reply/reply/history.js";
|
||||||
import { dispatchReplyWithBufferedBlockDispatcher } from "../auto-reply/reply/provider-dispatcher.js";
|
import { dispatchReplyWithBufferedBlockDispatcher } from "../auto-reply/reply/provider-dispatcher.js";
|
||||||
@@ -15,6 +21,18 @@ import { createTelegramDraftStream } from "./draft-stream.js";
|
|||||||
import { cacheSticker, describeStickerImage } from "./sticker-cache.js";
|
import { cacheSticker, describeStickerImage } from "./sticker-cache.js";
|
||||||
import { resolveAgentDir } from "../agents/agent-scope.js";
|
import { resolveAgentDir } from "../agents/agent-scope.js";
|
||||||
|
|
||||||
|
/**
 * Determines whether the agent's default model can receive sticker images
 * as native vision input.
 *
 * `cfg` is passed to the catalog loader and the default-model resolver;
 * `agentId` selects the agent whose default model is inspected.
 *
 * Resolves to `false` if the model is absent from the catalog or if any
 * lookup throws, to `true` if the catalog entry has no `input` value, and
 * otherwise to whatever `modelSupportsVision` returns for the entry.
 */
async function resolveStickerVisionSupport(cfg, agentId) {
  try {
    const modelCatalog = await loadModelCatalog({ config: cfg });
    const { provider, model } = resolveDefaultModelForAgent({ cfg, agentId });
    const catalogEntry = findModelInCatalog(modelCatalog, provider, model);
    if (!catalogEntry) {
      return false;
    }
    // Entries that do not declare `input` are assumed to accept images.
    if (!catalogEntry.input) {
      return true;
    }
    return modelSupportsVision(catalogEntry);
  } catch {
    // Best effort: any catalog or resolver failure counts as "no vision".
    return false;
  }
}
|
||||||
|
|
||||||
export const dispatchTelegramMessage = async ({
|
export const dispatchTelegramMessage = async ({
|
||||||
context,
|
context,
|
||||||
bot,
|
bot,
|
||||||
@@ -133,14 +151,18 @@ export const dispatchTelegramMessage = async ({
|
|||||||
// Handle uncached stickers: get a dedicated vision description before dispatch
|
// Handle uncached stickers: get a dedicated vision description before dispatch
|
||||||
// This ensures we cache a raw description rather than a conversational response
|
// This ensures we cache a raw description rather than a conversational response
|
||||||
const sticker = ctxPayload.Sticker;
|
const sticker = ctxPayload.Sticker;
|
||||||
if (sticker?.fileUniqueId && !sticker.cachedDescription && ctxPayload.MediaPath) {
|
if (sticker?.fileUniqueId && ctxPayload.MediaPath) {
|
||||||
const agentDir = resolveAgentDir(cfg, route.agentId);
|
const agentDir = resolveAgentDir(cfg, route.agentId);
|
||||||
const description = await describeStickerImage({
|
const stickerSupportsVision = await resolveStickerVisionSupport(cfg, route.agentId);
|
||||||
imagePath: ctxPayload.MediaPath,
|
let description = sticker.cachedDescription ?? null;
|
||||||
cfg,
|
if (!description) {
|
||||||
agentDir,
|
description = await describeStickerImage({
|
||||||
agentId: route.agentId,
|
imagePath: ctxPayload.MediaPath,
|
||||||
});
|
cfg,
|
||||||
|
agentDir,
|
||||||
|
agentId: route.agentId,
|
||||||
|
});
|
||||||
|
}
|
||||||
if (description) {
|
if (description) {
|
||||||
// Format the description with sticker context
|
// Format the description with sticker context
|
||||||
const stickerContext = [sticker.emoji, sticker.setName ? `from "${sticker.setName}"` : null]
|
const stickerContext = [sticker.emoji, sticker.setName ? `from "${sticker.setName}"` : null]
|
||||||
@@ -148,17 +170,19 @@ export const dispatchTelegramMessage = async ({
|
|||||||
.join(" ");
|
.join(" ");
|
||||||
const formattedDesc = `[Sticker${stickerContext ? ` ${stickerContext}` : ""}] ${description}`;
|
const formattedDesc = `[Sticker${stickerContext ? ` ${stickerContext}` : ""}] ${description}`;
|
||||||
|
|
||||||
// Update context to use description instead of image
|
|
||||||
sticker.cachedDescription = description;
|
sticker.cachedDescription = description;
|
||||||
ctxPayload.Body = formattedDesc;
|
if (!stickerSupportsVision) {
|
||||||
ctxPayload.BodyForAgent = formattedDesc;
|
// Update context to use description instead of image
|
||||||
// Clear media paths so native vision doesn't process the image again
|
ctxPayload.Body = formattedDesc;
|
||||||
ctxPayload.MediaPath = undefined;
|
ctxPayload.BodyForAgent = formattedDesc;
|
||||||
ctxPayload.MediaType = undefined;
|
// Clear media paths so native vision doesn't process the image again
|
||||||
ctxPayload.MediaUrl = undefined;
|
ctxPayload.MediaPath = undefined;
|
||||||
ctxPayload.MediaPaths = undefined;
|
ctxPayload.MediaType = undefined;
|
||||||
ctxPayload.MediaUrls = undefined;
|
ctxPayload.MediaUrl = undefined;
|
||||||
ctxPayload.MediaTypes = undefined;
|
ctxPayload.MediaPaths = undefined;
|
||||||
|
ctxPayload.MediaUrls = undefined;
|
||||||
|
ctxPayload.MediaTypes = undefined;
|
||||||
|
}
|
||||||
|
|
||||||
// Cache the description for future encounters
|
// Cache the description for future encounters
|
||||||
cacheSticker({
|
cacheSticker({
|
||||||
|
|||||||
@@ -4,11 +4,13 @@ import type { ClawdbotConfig } from "../config/config.js";
|
|||||||
import { STATE_DIR_CLAWDBOT } from "../config/paths.js";
|
import { STATE_DIR_CLAWDBOT } from "../config/paths.js";
|
||||||
import { loadJsonFile, saveJsonFile } from "../infra/json-file.js";
|
import { loadJsonFile, saveJsonFile } from "../infra/json-file.js";
|
||||||
import { logVerbose } from "../globals.js";
|
import { logVerbose } from "../globals.js";
|
||||||
|
import type { ModelCatalogEntry } from "../agents/model-catalog.js";
|
||||||
import {
|
import {
|
||||||
findModelInCatalog,
|
findModelInCatalog,
|
||||||
loadModelCatalog,
|
loadModelCatalog,
|
||||||
modelSupportsVision,
|
modelSupportsVision,
|
||||||
} from "../agents/model-catalog.js";
|
} from "../agents/model-catalog.js";
|
||||||
|
import { resolveApiKeyForProvider } from "../agents/model-auth.js";
|
||||||
import { resolveDefaultModelForAgent } from "../agents/model-selection.js";
|
import { resolveDefaultModelForAgent } from "../agents/model-selection.js";
|
||||||
import { resolveAutoImageModel } from "../media-understanding/runner.js";
|
import { resolveAutoImageModel } from "../media-understanding/runner.js";
|
||||||
|
|
||||||
@@ -140,6 +142,7 @@ export function getCacheStats(): { count: number; oldestAt?: string; newestAt?:
|
|||||||
|
|
||||||
const STICKER_DESCRIPTION_PROMPT =
|
const STICKER_DESCRIPTION_PROMPT =
|
||||||
"Describe this sticker image in 1-2 sentences. Focus on what the sticker depicts (character, object, action, emotion). Be concise and objective.";
|
"Describe this sticker image in 1-2 sentences. Focus on what the sticker depicts (character, object, action, emotion). Be concise and objective.";
|
||||||
|
const VISION_PROVIDERS = ["openai", "anthropic", "google", "minimax"] as const;
|
||||||
|
|
||||||
export interface DescribeStickerParams {
|
export interface DescribeStickerParams {
|
||||||
imagePath: string;
|
imagePath: string;
|
||||||
@@ -158,31 +161,80 @@ export async function describeStickerImage(params: DescribeStickerParams): Promi
|
|||||||
|
|
||||||
const defaultModel = resolveDefaultModelForAgent({ cfg, agentId });
|
const defaultModel = resolveDefaultModelForAgent({ cfg, agentId });
|
||||||
let activeModel = undefined as { provider: string; model: string } | undefined;
|
let activeModel = undefined as { provider: string; model: string } | undefined;
|
||||||
|
let catalog: ModelCatalogEntry[] = [];
|
||||||
try {
|
try {
|
||||||
const catalog = await loadModelCatalog({ config: cfg });
|
catalog = await loadModelCatalog({ config: cfg });
|
||||||
const entry = findModelInCatalog(catalog, defaultModel.provider, defaultModel.model);
|
const entry = findModelInCatalog(catalog, defaultModel.provider, defaultModel.model);
|
||||||
if (modelSupportsVision(entry)) {
|
const supportsVision = entry?.input ? modelSupportsVision(entry) : Boolean(entry);
|
||||||
|
if (supportsVision) {
|
||||||
activeModel = { provider: defaultModel.provider, model: defaultModel.model };
|
activeModel = { provider: defaultModel.provider, model: defaultModel.model };
|
||||||
}
|
}
|
||||||
} catch {
|
} catch {
|
||||||
// Ignore catalog failures; fall back to auto selection.
|
// Ignore catalog failures; fall back to auto selection.
|
||||||
}
|
}
|
||||||
|
|
||||||
const resolved = await resolveAutoImageModel({
|
const hasProviderKey = async (provider: string) => {
|
||||||
cfg,
|
try {
|
||||||
agentDir,
|
await resolveApiKeyForProvider({ provider, cfg, agentDir });
|
||||||
activeModel,
|
return true;
|
||||||
});
|
} catch {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Chooses a catalog entry for `provider`. Candidates are entries whose
// provider matches case-insensitively and that either declare no `input`
// or pass `modelSupportsVision`. The provider's preferred model id wins when
// present among the candidates; otherwise the first candidate is returned.
// Yields `undefined` when there are no candidates.
const selectCatalogModel = (provider: string) => {
  const wantedProvider = provider.toLowerCase();
  const candidates = catalog.filter((candidate) => {
    if (candidate.provider.toLowerCase() !== wantedProvider) return false;
    return candidate.input ? modelSupportsVision(candidate) : true;
  });
  if (candidates.length === 0) return undefined;

  // The preferred id is keyed on the exact provider string; anything other
  // than the three named providers falls through to the MiniMax model id.
  let preferredId: string;
  switch (provider) {
    case "openai":
      preferredId = "gpt-5-mini";
      break;
    case "anthropic":
      preferredId = "claude-opus-4-5";
      break;
    case "google":
      preferredId = "gemini-3-flash-preview";
      break;
    default:
      preferredId = "MiniMax-VL-01";
      break;
  }

  const preferredEntry = candidates.find((candidate) => candidate.id === preferredId);
  return preferredEntry ?? candidates[0];
};
|
||||||
|
|
||||||
|
let resolved = null as { provider: string; model?: string } | null;
|
||||||
|
if (
|
||||||
|
activeModel &&
|
||||||
|
VISION_PROVIDERS.includes(activeModel.provider as (typeof VISION_PROVIDERS)[number]) &&
|
||||||
|
(await hasProviderKey(activeModel.provider))
|
||||||
|
) {
|
||||||
|
resolved = activeModel;
|
||||||
|
}
|
||||||
|
|
||||||
if (!resolved) {
|
if (!resolved) {
|
||||||
|
for (const provider of VISION_PROVIDERS) {
|
||||||
|
if (!(await hasProviderKey(provider))) continue;
|
||||||
|
const entry = selectCatalogModel(provider);
|
||||||
|
if (entry) {
|
||||||
|
resolved = { provider, model: entry.id };
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!resolved) {
|
||||||
|
resolved = await resolveAutoImageModel({
|
||||||
|
cfg,
|
||||||
|
agentDir,
|
||||||
|
activeModel,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!resolved?.model) {
|
||||||
logVerbose("telegram: no vision provider available for sticker description");
|
logVerbose("telegram: no vision provider available for sticker description");
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
const { provider, model } = resolved;
|
const { provider, model } = resolved;
|
||||||
if (!model) {
|
|
||||||
logVerbose(`telegram: no vision model available for ${provider}`);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
logVerbose(`telegram: describing sticker with ${provider}/${model}`);
|
logVerbose(`telegram: describing sticker with ${provider}/${model}`);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
|||||||
Reference in New Issue
Block a user