refactor: extend media understanding
This commit is contained in:
@@ -68,6 +68,7 @@ export const handleStatusCommand: CommandHandler = async (params, allowTextComma
|
||||
resolveDefaultThinkingLevel: params.resolveDefaultThinkingLevel,
|
||||
isGroup: params.isGroup,
|
||||
defaultGroupActivation: params.defaultGroupActivation,
|
||||
mediaDecisions: params.ctx.MediaUnderstandingDecisions,
|
||||
});
|
||||
return { shouldContinue: false, reply };
|
||||
};
|
||||
|
||||
@@ -24,6 +24,7 @@ import type { ElevatedLevel, ReasoningLevel, ThinkLevel, VerboseLevel } from "..
|
||||
import type { ReplyPayload } from "../types.js";
|
||||
import type { CommandContext } from "./commands-types.js";
|
||||
import { getFollowupQueueDepth, resolveQueueSettings } from "./queue.js";
|
||||
import type { MediaUnderstandingDecision } from "../../media-understanding/types.js";
|
||||
|
||||
function formatApiKeySnippet(apiKey: string): string {
|
||||
const compact = apiKey.replace(/\s+/g, "");
|
||||
@@ -105,6 +106,7 @@ export async function buildStatusReply(params: {
|
||||
resolveDefaultThinkingLevel: () => Promise<ThinkLevel | undefined>;
|
||||
isGroup: boolean;
|
||||
defaultGroupActivation: () => "always" | "mention";
|
||||
mediaDecisions?: MediaUnderstandingDecision[];
|
||||
}): Promise<ReplyPayload | undefined> {
|
||||
const {
|
||||
cfg,
|
||||
@@ -200,6 +202,7 @@ export async function buildStatusReply(params: {
|
||||
dropPolicy: queueSettings.dropPolicy,
|
||||
showDetails: queueOverrides,
|
||||
},
|
||||
mediaDecisions: params.mediaDecisions,
|
||||
includeTranscriptUsage: false,
|
||||
});
|
||||
|
||||
|
||||
@@ -188,6 +188,7 @@ export async function applyInlineDirectiveOverrides(params: {
|
||||
resolveDefaultThinkingLevel: async () => resolvedDefaultThinkLevel,
|
||||
isGroup,
|
||||
defaultGroupActivation: defaultActivation,
|
||||
mediaDecisions: ctx.MediaUnderstandingDecisions,
|
||||
});
|
||||
}
|
||||
typing.cleanup();
|
||||
|
||||
@@ -185,6 +185,7 @@ export async function handleInlineActions(params: {
|
||||
resolveDefaultThinkingLevel,
|
||||
isGroup,
|
||||
defaultGroupActivation: defaultActivation,
|
||||
mediaDecisions: ctx.MediaUnderstandingDecisions,
|
||||
});
|
||||
await sendInlineReply(inlineStatusReply);
|
||||
directives = { ...directives, hasStatusDirective: false };
|
||||
|
||||
@@ -90,6 +90,59 @@ describe("buildStatusMessage", () => {
|
||||
expect(text).toContain("elevated");
|
||||
});
|
||||
|
||||
// Verifies that per-capability media decisions are condensed into the single
// "Media: ..." status line: a successful image decision shows provider/model,
// a skipped audio decision shows only the short reason before the colon.
it("includes media understanding decisions when present", () => {
  // One successful provider attempt, reused as both the attempt and the chosen entry.
  const openaiAttempt = {
    type: "provider",
    outcome: "success",
    provider: "openai",
    model: "gpt-5.2",
  } as const;

  const statusText = buildStatusMessage({
    agent: { model: "anthropic/claude-opus-4-5" },
    sessionEntry: { sessionId: "media", updatedAt: 0 },
    sessionKey: "agent:main:main",
    queue: { mode: "none" },
    mediaDecisions: [
      {
        capability: "image",
        outcome: "success",
        attachments: [{ attachmentIndex: 0, attempts: [openaiAttempt], chosen: openaiAttempt }],
      },
      {
        capability: "audio",
        outcome: "skipped",
        attachments: [
          {
            attachmentIndex: 1,
            attempts: [{ type: "provider", outcome: "skipped", reason: "maxBytes: too large" }],
          },
        ],
      },
    ],
  });

  expect(normalizeTestText(statusText)).toContain(
    "Media: image ok (openai/gpt-5.2) · audio skipped (maxBytes)",
  );
});
|
||||
|
||||
it("does not show elevated label when session explicitly disables it", () => {
|
||||
const text = buildStatusMessage({
|
||||
agent: { model: "anthropic/claude-opus-4-5", elevatedDefault: "on" },
|
||||
|
||||
@@ -24,6 +24,7 @@ import { VERSION } from "../version.js";
|
||||
import { listChatCommands, listChatCommandsForConfig } from "./commands-registry.js";
|
||||
import type { SkillCommandSpec } from "../agents/skills.js";
|
||||
import type { ElevatedLevel, ReasoningLevel, ThinkLevel, VerboseLevel } from "./thinking.js";
|
||||
import type { MediaUnderstandingDecision } from "../media-understanding/types.js";
|
||||
|
||||
type AgentConfig = Partial<NonNullable<NonNullable<ClawdbotConfig["agents"]>["defaults"]>>;
|
||||
|
||||
@@ -52,6 +53,7 @@ type StatusArgs = {
|
||||
modelAuth?: string;
|
||||
usageLine?: string;
|
||||
queue?: QueueStatus;
|
||||
mediaDecisions?: MediaUnderstandingDecision[];
|
||||
includeTranscriptUsage?: boolean;
|
||||
now?: number;
|
||||
};
|
||||
@@ -167,6 +169,42 @@ const formatUsagePair = (input?: number | null, output?: number | null) => {
|
||||
return `🧮 Tokens: ${inputLabel} in / ${outputLabel} out`;
|
||||
};
|
||||
|
||||
/**
 * Builds the one-line "📎 Media: ..." summary for the status message.
 *
 * Each decision contributes one segment, joined with " · ":
 * - success        -> "<capability>[ xN] ok[ (provider[/model])]"
 * - no-attachment  -> "<capability> none"
 * - disabled       -> "<capability> off"
 * - scope-deny     -> "<capability> denied"
 * - skipped        -> "<capability> skipped[ (reason text before the first ':')]"
 * Any other outcome contributes nothing.
 *
 * @returns the formatted line, or null when there is nothing to report.
 */
const formatMediaUnderstandingLine = (decisions?: MediaUnderstandingDecision[]) => {
  if (!decisions?.length) return null;

  const segments: string[] = [];
  for (const decision of decisions) {
    const { capability, outcome, attachments } = decision;
    // Only rendered for successful decisions, and only when more than one attachment was handled.
    const multiplier = attachments.length > 1 ? ` x${attachments.length}` : "";

    switch (outcome) {
      case "success": {
        // The first attachment that recorded a chosen attempt supplies the label.
        const winner = attachments.find((item) => item.chosen)?.chosen;
        const providerName = winner?.provider?.trim();
        const modelName = winner?.model?.trim();
        let suffix = "";
        if (providerName) {
          suffix = modelName ? ` (${providerName}/${modelName})` : ` (${providerName})`;
        }
        segments.push(`${capability}${multiplier} ok${suffix}`);
        break;
      }
      case "no-attachment":
        segments.push(`${capability} none`);
        break;
      case "disabled":
        segments.push(`${capability} off`);
        break;
      case "scope-deny":
        segments.push(`${capability} denied`);
        break;
      case "skipped": {
        // First non-empty reason across all attempts of all attachments.
        const firstReason = attachments
          .flatMap((item) => item.attempts.map((attempt) => attempt.reason))
          .find(Boolean);
        // Reasons look like "<code>: <detail>"; only the part before the colon is shown.
        const shortReason = firstReason ? firstReason.split(":")[0]?.trim() : undefined;
        segments.push(`${capability} skipped${shortReason ? ` (${shortReason})` : ""}`);
        break;
      }
      default:
        break;
    }
  }

  if (segments.length === 0) return null;
  return `📎 Media: ${segments.join(" · ")}`;
};
|
||||
|
||||
export function buildStatusMessage(args: StatusArgs): string {
|
||||
const now = args.now ?? Date.now();
|
||||
const entry = args.sessionEntry;
|
||||
@@ -320,12 +358,14 @@ export function buildStatusMessage(args: StatusArgs): string {
|
||||
const costLine = costLabel ? `💵 Cost: ${costLabel}` : null;
|
||||
const usageCostLine =
|
||||
usagePair && costLine ? `${usagePair} · ${costLine}` : (usagePair ?? costLine);
|
||||
const mediaLine = formatMediaUnderstandingLine(args.mediaDecisions);
|
||||
|
||||
return [
|
||||
versionLine,
|
||||
modelLine,
|
||||
usageCostLine,
|
||||
`📚 ${contextLine}`,
|
||||
mediaLine,
|
||||
args.usageLine,
|
||||
`🧵 ${sessionLine}`,
|
||||
`⚙️ ${optionsLine}`,
|
||||
|
||||
@@ -1,52 +1,26 @@
|
||||
import type { ClawdbotConfig } from "../config/config.js";
|
||||
import type { MsgContext } from "../auto-reply/templating.js";
|
||||
import { applyTemplate } from "../auto-reply/templating.js";
|
||||
import { finalizeInboundContext } from "../auto-reply/reply/inbound-context.js";
|
||||
import { resolveApiKeyForProvider } from "../agents/model-auth.js";
|
||||
import { logVerbose, shouldLogVerbose } from "../globals.js";
|
||||
import { runExec } from "../process/exec.js";
|
||||
import type {
|
||||
MediaUnderstandingConfig,
|
||||
MediaUnderstandingModelConfig,
|
||||
} from "../config/types.tools.js";
|
||||
import { MediaAttachmentCache, normalizeAttachments, selectAttachments } from "./attachments.js";
|
||||
import {
|
||||
CLI_OUTPUT_MAX_BUFFER,
|
||||
DEFAULT_AUDIO_MODELS,
|
||||
DEFAULT_TIMEOUT_SECONDS,
|
||||
} from "./defaults.js";
|
||||
import { isMediaUnderstandingSkipError, MediaUnderstandingSkipError } from "./errors.js";
|
||||
import {
|
||||
extractMediaUserText,
|
||||
formatAudioTranscripts,
|
||||
formatMediaUnderstandingBody,
|
||||
} from "./format.js";
|
||||
import {
|
||||
buildMediaUnderstandingRegistry,
|
||||
getMediaUnderstandingProvider,
|
||||
normalizeMediaProviderId,
|
||||
} from "./providers/index.js";
|
||||
import { describeImageWithModel } from "./providers/image.js";
|
||||
import {
|
||||
resolveCapabilityConfig,
|
||||
inferProviderCapabilities,
|
||||
resolveConcurrency,
|
||||
resolveMaxBytes,
|
||||
resolveMaxChars,
|
||||
resolveModelEntries,
|
||||
resolvePrompt,
|
||||
resolveScopeDecision,
|
||||
resolveTimeoutMs,
|
||||
} from "./resolve.js";
|
||||
import type {
|
||||
MediaUnderstandingCapability,
|
||||
MediaUnderstandingDecision,
|
||||
MediaUnderstandingModelDecision,
|
||||
MediaUnderstandingOutput,
|
||||
MediaUnderstandingProvider,
|
||||
} from "./types.js";
|
||||
import { runWithConcurrency } from "./concurrency.js";
|
||||
import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.js";
|
||||
import { resolveConcurrency } from "./resolve.js";
|
||||
import {
|
||||
type ActiveMediaModel,
|
||||
buildProviderRegistry,
|
||||
createMediaAttachmentCache,
|
||||
normalizeMediaAttachments,
|
||||
runCapability,
|
||||
} from "./runner.js";
|
||||
|
||||
export type ApplyMediaUnderstandingResult = {
|
||||
outputs: MediaUnderstandingOutput[];
|
||||
@@ -58,476 +32,6 @@ export type ApplyMediaUnderstandingResult = {
|
||||
|
||||
// Capabilities processed by applyMediaUnderstanding; one runCapability task is built per entry, in this order.
const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"];
|
||||
|
||||
/**
 * Provider/model pair passed around as `activeModel`.
 * `provider` is required; `model` may be omitted.
 */
type ActiveMediaModel = { provider: string; model?: string };
|
||||
|
||||
/**
 * Trims surrounding whitespace from `text` and optionally caps its length.
 *
 * @param text - raw model/CLI output.
 * @param maxChars - maximum length in UTF-16 code units. Missing, zero, NaN or
 *   negative values mean "no limit".
 * @returns the trimmed text, cut to at most `maxChars` code units and trimmed
 *   again so the cut never leaves trailing whitespace.
 */
function trimOutput(text: string, maxChars?: number): string {
  const trimmed = text.trim();
  // A negative limit would otherwise reach slice(0, -n) and silently drop the tail.
  if (!maxChars || maxChars < 0 || trimmed.length <= maxChars) return trimmed;

  let clipped = trimmed.slice(0, maxChars);
  // Do not cut an astral character (e.g. an emoji) in half: if the cut landed
  // between a high and a low surrogate, drop the dangling high surrogate too.
  const lastUnit = clipped.charCodeAt(clipped.length - 1);
  const nextUnit = trimmed.charCodeAt(clipped.length);
  const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
  const nextIsLowSurrogate = nextUnit >= 0xdc00 && nextUnit <= 0xdfff;
  if (endsOnHighSurrogate && nextIsLowSurrogate) {
    clipped = clipped.slice(0, -1);
  }
  return clipped.trim();
}
|
||||
|
||||
/**
 * Resolves the model entries to try for a capability, falling back to the
 * active model when nothing is configured.
 *
 * The fallback is used only when all of these hold:
 * - `resolveModelEntries` returned no entries,
 * - the capability config has `enabled === true` (explicit opt-in),
 * - an active provider is known, and
 * - `inferProviderCapabilities` lists the requested capability for it.
 * Otherwise the (possibly empty) configured list is returned unchanged.
 */
function resolveEntriesWithActiveFallback(params: {
  cfg: ClawdbotConfig;
  capability: MediaUnderstandingCapability;
  config?: MediaUnderstandingConfig;
  activeModel?: ActiveMediaModel;
}): MediaUnderstandingModelConfig[] {
  const { cfg, capability, config, activeModel } = params;
  const configured = resolveModelEntries({ cfg, capability, config });

  const fallbackAllowed = configured.length === 0 && config?.enabled === true;
  if (!fallbackAllowed) return configured;

  const fallbackProvider = activeModel?.provider?.trim();
  if (!fallbackProvider) return configured;

  const supported = inferProviderCapabilities(fallbackProvider) ?? [];
  if (!supported.includes(capability)) return configured;

  return [{ type: "provider", provider: fallbackProvider, model: activeModel?.model }];
}
|
||||
|
||||
/**
 * Builds the per-attempt decision record for a model entry.
 *
 * @param params.entry - the configured model entry that was attempted.
 * @param params.entryType - whether the entry was run as a CLI command or a provider call.
 * @param params.outcome - result of the attempt.
 * @param params.reason - optional explanation (skip reason or error text).
 * @returns a decision whose `provider`/`model` identify what was attempted:
 *   for CLI entries the trimmed command (or "cli" when it is missing or blank),
 *   for provider entries the normalized provider id (raw id as fallback).
 */
function buildModelDecision(params: {
  entry: MediaUnderstandingModelConfig;
  entryType: "provider" | "cli";
  outcome: MediaUnderstandingModelDecision["outcome"];
  reason?: string;
}): MediaUnderstandingModelDecision {
  const { entry, entryType, outcome, reason } = params;

  if (entryType === "cli") {
    const command = entry.command?.trim();
    return {
      type: "cli",
      // `||` rather than `??`: a whitespace-only command trims to "" and must
      // still fall back to the "cli" placeholder instead of an empty label.
      provider: command || "cli",
      model: entry.model ?? command,
      outcome,
      reason,
    };
  }

  const providerIdRaw = entry.provider?.trim();
  const providerId = providerIdRaw ? normalizeMediaProviderId(providerIdRaw) : undefined;
  return {
    type: "provider",
    provider: providerId ?? providerIdRaw,
    model: entry.model,
    outcome,
    reason,
  };
}
|
||||
|
||||
/**
 * Runs one provider-backed model entry against a single attachment.
 *
 * Limits, timeout and prompt are resolved from the entry first, then the
 * capability config, then `cfg.tools.media[capability]`.
 *
 * - image: requires `agentDir` and a model id; uses the registered provider's
 *   `describeImage` when present, otherwise `describeImageWithModel`.
 * - audio: requires a registered provider with `transcribeAudio`.
 * - video: requires a registered provider with `describeVideo`; the estimated
 *   base64 payload is checked against the video limit before the request.
 *
 * @throws Error when the entry has no provider, a required provider/handler is
 *   missing, or image prerequisites are not met.
 * @throws MediaUnderstandingSkipError ("maxBytes") when the video payload is too large.
 */
async function runProviderEntry(params: {
  capability: MediaUnderstandingCapability;
  entry: MediaUnderstandingModelConfig;
  cfg: ClawdbotConfig;
  ctx: MsgContext;
  attachmentIndex: number;
  cache: MediaAttachmentCache;
  agentDir?: string;
  providerRegistry: Map<string, MediaUnderstandingProvider>;
  config?: MediaUnderstandingConfig;
}): Promise<MediaUnderstandingOutput | null> {
  const { entry, capability, cfg, attachmentIndex, agentDir, config } = params;

  const rawProviderId = entry.provider?.trim();
  if (!rawProviderId) {
    throw new Error(`Provider entry missing provider for ${capability}`);
  }
  const providerId = normalizeMediaProviderId(rawProviderId);

  const maxBytes = resolveMaxBytes({ capability, entry, cfg, config });
  const maxChars = resolveMaxChars({ capability, entry, cfg, config });
  const timeoutSeconds =
    entry.timeoutSeconds ?? config?.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds;
  const timeoutMs = resolveTimeoutMs(timeoutSeconds, DEFAULT_TIMEOUT_SECONDS[capability]);
  const promptOverride = entry.prompt ?? config?.prompt ?? cfg.tools?.media?.[capability]?.prompt;
  const prompt = resolvePrompt(capability, promptOverride, maxChars);

  // Shared by all three branches; each branch decides when the bytes are fetched.
  const loadMedia = () => params.cache.getBuffer({ attachmentIndex, maxBytes, timeoutMs });

  if (capability === "image") {
    if (!agentDir) {
      throw new Error("Image understanding requires agentDir");
    }
    const modelId = entry.model?.trim();
    if (!modelId) {
      throw new Error("Image understanding requires model id");
    }
    const imageMedia = await loadMedia();
    const imageProvider = getMediaUnderstandingProvider(providerId, params.providerRegistry);
    const imageRequest = {
      buffer: imageMedia.buffer,
      fileName: imageMedia.fileName,
      mime: imageMedia.mime,
      model: modelId,
      provider: providerId,
      prompt,
      timeoutMs,
      profile: entry.profile,
      preferredProfile: entry.preferredProfile,
      agentDir,
      cfg,
    };
    // Unregistered providers (or ones without describeImage) use the generic describer.
    const described = imageProvider?.describeImage
      ? await imageProvider.describeImage(imageRequest)
      : await describeImageWithModel(imageRequest);
    return {
      kind: "image.description",
      attachmentIndex,
      text: trimOutput(described.text, maxChars),
      provider: providerId,
      model: described.model ?? modelId,
    };
  }

  // Audio and video both need a registered provider.
  const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry);
  if (!provider) {
    throw new Error(`Media provider not available: ${providerId}`);
  }

  const resolveKey = () =>
    resolveApiKeyForProvider({
      provider: providerId,
      cfg,
      profileId: entry.profile,
      preferredProfile: entry.preferredProfile,
      agentDir,
    });

  if (capability === "audio") {
    if (!provider.transcribeAudio) {
      throw new Error(`Audio transcription provider "${providerId}" not available.`);
    }
    const audioMedia = await loadMedia();
    const audioKey = await resolveKey();
    const audioProviderConfig = cfg.models?.providers?.[providerId];
    const audioModel = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model;
    const transcript = await provider.transcribeAudio({
      buffer: audioMedia.buffer,
      fileName: audioMedia.fileName,
      mime: audioMedia.mime,
      apiKey: audioKey.apiKey,
      baseUrl: audioProviderConfig?.baseUrl,
      headers: audioProviderConfig?.headers,
      model: audioModel,
      language: entry.language ?? config?.language ?? cfg.tools?.media?.audio?.language,
      prompt,
      timeoutMs,
    });
    return {
      kind: "audio.transcription",
      attachmentIndex,
      text: trimOutput(transcript.text, maxChars),
      provider: providerId,
      model: transcript.model ?? audioModel,
    };
  }

  if (!provider.describeVideo) {
    throw new Error(`Video understanding provider "${providerId}" not available.`);
  }
  const videoMedia = await loadMedia();
  // The size check runs before the API key lookup, so oversized videos are skipped first.
  const estimatedBase64Bytes = estimateBase64Size(videoMedia.size);
  const maxBase64Bytes = resolveVideoMaxBase64Bytes(maxBytes);
  if (estimatedBase64Bytes > maxBase64Bytes) {
    throw new MediaUnderstandingSkipError(
      "maxBytes",
      `Video attachment ${params.attachmentIndex + 1} base64 payload ${estimatedBase64Bytes} exceeds ${maxBase64Bytes}`,
    );
  }
  const videoKey = await resolveKey();
  const videoProviderConfig = cfg.models?.providers?.[providerId];
  const videoResult = await provider.describeVideo({
    buffer: videoMedia.buffer,
    fileName: videoMedia.fileName,
    mime: videoMedia.mime,
    apiKey: videoKey.apiKey,
    baseUrl: videoProviderConfig?.baseUrl,
    headers: videoProviderConfig?.headers,
    model: entry.model,
    prompt,
    timeoutMs,
  });
  return {
    kind: "video.description",
    attachmentIndex,
    text: trimOutput(videoResult.text, maxChars),
    provider: providerId,
    model: videoResult.model ?? entry.model,
  };
}
|
||||
|
||||
/**
 * Runs one CLI-backed model entry against a single attachment.
 *
 * The attachment is materialized to a path via the cache, and every argument
 * (but not the command itself) is passed through `applyTemplate` with a context
 * that adds `MediaPath`, `Prompt` and `MaxChars` to the message context.
 *
 * @returns the trimmed stdout as an output, or null when stdout is empty after trimming.
 * @throws Error when the entry has no command.
 */
async function runCliEntry(params: {
  capability: MediaUnderstandingCapability;
  entry: MediaUnderstandingModelConfig;
  cfg: ClawdbotConfig;
  ctx: MsgContext;
  attachmentIndex: number;
  cache: MediaAttachmentCache;
  config?: MediaUnderstandingConfig;
}): Promise<MediaUnderstandingOutput | null> {
  const { entry, capability, cfg, ctx, attachmentIndex, config } = params;

  const command = entry.command?.trim();
  const rawArgs = entry.args ?? [];
  if (!command) {
    throw new Error(`CLI entry missing command for ${capability}`);
  }

  const maxBytes = resolveMaxBytes({ capability, entry, cfg, config });
  const maxChars = resolveMaxChars({ capability, entry, cfg, config });
  const timeoutSeconds =
    entry.timeoutSeconds ?? config?.timeoutSeconds ?? cfg.tools?.media?.[capability]?.timeoutSeconds;
  const timeoutMs = resolveTimeoutMs(timeoutSeconds, DEFAULT_TIMEOUT_SECONDS[capability]);
  const promptOverride = entry.prompt ?? config?.prompt ?? cfg.tools?.media?.[capability]?.prompt;
  const prompt = resolvePrompt(capability, promptOverride, maxChars);

  const located = await params.cache.getPath({ attachmentIndex, maxBytes, timeoutMs });

  const templCtx: MsgContext = {
    ...ctx,
    MediaPath: located.path,
    Prompt: prompt,
    MaxChars: maxChars,
  };
  // Only the arguments are templated; the command is used verbatim.
  const renderedArgs = rawArgs.map((arg) => applyTemplate(arg, templCtx));

  if (shouldLogVerbose()) {
    logVerbose(`Media understanding via CLI: ${[command, ...renderedArgs].join(" ")}`);
  }

  const { stdout } = await runExec(command, renderedArgs, {
    timeoutMs,
    maxBuffer: CLI_OUTPUT_MAX_BUFFER,
  });

  const text = trimOutput(stdout, maxChars);
  if (!text) return null;

  return {
    kind: capability === "audio" ? "audio.transcription" : `${capability}.description`,
    attachmentIndex,
    text,
    provider: "cli",
    model: command,
  };
}
|
||||
|
||||
/**
 * Tries each model entry in order for one attachment until one produces output.
 *
 * Every entry tried is recorded in `attempts`:
 * - non-empty result          -> "success" (provider/model overwritten from the result), then stop
 * - empty result              -> "skipped" with reason "empty output"
 * - MediaUnderstandingSkipError -> "skipped" with "<reason>: <message>"
 * - any other error           -> "failed" with the stringified error
 *
 * @returns the first successful output (or null) together with all recorded attempts.
 */
async function runAttachmentEntries(params: {
  capability: MediaUnderstandingCapability;
  cfg: ClawdbotConfig;
  ctx: MsgContext;
  attachmentIndex: number;
  agentDir?: string;
  providerRegistry: Map<string, MediaUnderstandingProvider>;
  cache: MediaAttachmentCache;
  entries: MediaUnderstandingModelConfig[];
  config?: MediaUnderstandingConfig;
}): Promise<{ output: MediaUnderstandingOutput | null; attempts: MediaUnderstandingModelDecision[] }> {
  const { entries, capability } = params;
  const attempts: MediaUnderstandingModelDecision[] = [];

  for (const entry of entries) {
    // Entries without an explicit type are CLI entries when they carry a command.
    const entryType = entry.type ?? (entry.command ? "cli" : "provider");

    try {
      const shared = {
        capability,
        entry,
        cfg: params.cfg,
        ctx: params.ctx,
        attachmentIndex: params.attachmentIndex,
        cache: params.cache,
        config: params.config,
      };
      const result =
        entryType === "cli"
          ? await runCliEntry(shared)
          : await runProviderEntry({
              ...shared,
              agentDir: params.agentDir,
              providerRegistry: params.providerRegistry,
            });

      if (!result) {
        attempts.push(
          buildModelDecision({ entry, entryType, outcome: "skipped", reason: "empty output" }),
        );
        continue;
      }

      const decision = buildModelDecision({ entry, entryType, outcome: "success" });
      // Prefer what the run actually reported over what was configured.
      if (result.provider) decision.provider = result.provider;
      if (result.model) decision.model = result.model;
      attempts.push(decision);
      return { output: result, attempts };
    } catch (err) {
      if (isMediaUnderstandingSkipError(err)) {
        attempts.push(
          buildModelDecision({
            entry,
            entryType,
            outcome: "skipped",
            reason: `${err.reason}: ${err.message}`,
          }),
        );
        if (shouldLogVerbose()) {
          logVerbose(`Skipping ${capability} model due to ${err.reason}: ${err.message}`);
        }
      } else {
        attempts.push(
          buildModelDecision({ entry, entryType, outcome: "failed", reason: String(err) }),
        );
        if (shouldLogVerbose()) {
          logVerbose(`${capability} understanding failed: ${String(err)}`);
        }
      }
    }
  }

  return { output: null, attempts };
}
|
||||
|
||||
/**
 * Runs one capability (image/audio/video) over the selected attachments and
 * reports both the outputs and a decision record describing what happened.
 *
 * Early exits, checked in this order, all return no outputs:
 * - config disabled             -> "disabled"
 * - no attachments selected     -> "no-attachment"
 * - scope policy says "deny"    -> "scope-deny" (selected attachments listed, no attempts)
 * - no model entries resolved   -> "skipped"    (selected attachments listed, no attempts)
 *
 * Otherwise each selected attachment is processed sequentially; the overall
 * outcome is "success" when at least one output was produced, else "skipped".
 */
async function runCapability(params: {
  capability: MediaUnderstandingCapability;
  cfg: ClawdbotConfig;
  ctx: MsgContext;
  attachments: MediaAttachmentCache;
  media: ReturnType<typeof normalizeAttachments>;
  agentDir?: string;
  providerRegistry: Map<string, MediaUnderstandingProvider>;
  config?: MediaUnderstandingConfig;
  activeModel?: ActiveMediaModel;
}): Promise<{ outputs: MediaUnderstandingOutput[]; decision: MediaUnderstandingDecision }> {
  const { capability, cfg, ctx } = params;
  const config = params.config ?? resolveCapabilityConfig(cfg, capability);

  // Result shape shared by every early exit: no outputs, just the decision.
  const withoutOutputs = (
    outcome: MediaUnderstandingDecision["outcome"],
    attachments: MediaUnderstandingDecision["attachments"] = [],
  ): { outputs: MediaUnderstandingOutput[]; decision: MediaUnderstandingDecision } => ({
    outputs: [],
    decision: { capability, outcome, attachments },
  });

  if (config?.enabled === false) {
    return withoutOutputs("disabled");
  }

  const selected = selectAttachments({
    capability,
    attachments: params.media,
    policy: config?.attachments,
  });
  if (selected.length === 0) {
    return withoutOutputs("no-attachment");
  }

  // Selected attachments recorded with an empty attempt list.
  const untried = (): MediaUnderstandingDecision["attachments"] =>
    selected.map((item) => ({ attachmentIndex: item.index, attempts: [] }));

  if (resolveScopeDecision({ scope: config?.scope, ctx }) === "deny") {
    if (shouldLogVerbose()) {
      logVerbose(`${capability} understanding disabled by scope policy.`);
    }
    return withoutOutputs("scope-deny", untried());
  }

  const entries = resolveEntriesWithActiveFallback({
    cfg,
    capability,
    config,
    activeModel: params.activeModel,
  });
  if (entries.length === 0) {
    return withoutOutputs("skipped", untried());
  }

  const outputs: MediaUnderstandingOutput[] = [];
  const perAttachment: MediaUnderstandingDecision["attachments"] = [];
  for (const { index: attachmentIndex } of selected) {
    const { output, attempts } = await runAttachmentEntries({
      capability,
      cfg,
      ctx,
      attachmentIndex,
      agentDir: params.agentDir,
      providerRegistry: params.providerRegistry,
      cache: params.attachments,
      entries,
      config,
    });
    if (output) outputs.push(output);
    perAttachment.push({
      attachmentIndex,
      attempts,
      chosen: attempts.find((attempt) => attempt.outcome === "success"),
    });
  }

  return {
    outputs,
    decision: {
      capability,
      outcome: outputs.length > 0 ? "success" : "skipped",
      attachments: perAttachment,
    },
  };
}
|
||||
|
||||
export async function applyMediaUnderstanding(params: {
|
||||
ctx: MsgContext;
|
||||
cfg: ClawdbotConfig;
|
||||
@@ -542,13 +46,13 @@ export async function applyMediaUnderstanding(params: {
|
||||
.map((value) => extractMediaUserText(value))
|
||||
.find((value) => value && value.trim()) ?? undefined;
|
||||
|
||||
const attachments = normalizeAttachments(ctx);
|
||||
const providerRegistry = buildMediaUnderstandingRegistry(params.providers);
|
||||
const cache = new MediaAttachmentCache(attachments);
|
||||
const attachments = normalizeMediaAttachments(ctx);
|
||||
const providerRegistry = buildProviderRegistry(params.providers);
|
||||
const cache = createMediaAttachmentCache(attachments);
|
||||
|
||||
try {
|
||||
const tasks = CAPABILITY_ORDER.map((capability) => async () => {
|
||||
const config = resolveCapabilityConfig(cfg, capability);
|
||||
const config = cfg.tools?.media?.[capability];
|
||||
return await runCapability({
|
||||
capability,
|
||||
cfg,
|
||||
@@ -565,17 +69,12 @@ export async function applyMediaUnderstanding(params: {
|
||||
const results = await runWithConcurrency(tasks, resolveConcurrency(cfg));
|
||||
const outputs: MediaUnderstandingOutput[] = [];
|
||||
const decisions: MediaUnderstandingDecision[] = [];
|
||||
for (const [index] of CAPABILITY_ORDER.entries()) {
|
||||
const entry = results[index];
|
||||
for (const entry of results) {
|
||||
if (!entry) continue;
|
||||
if (Array.isArray(entry.outputs)) {
|
||||
for (const output of entry.outputs) {
|
||||
outputs.push(output);
|
||||
}
|
||||
}
|
||||
if (entry.decision) {
|
||||
decisions.push(entry.decision);
|
||||
for (const output of entry.outputs) {
|
||||
outputs.push(output);
|
||||
}
|
||||
decisions.push(entry.decision);
|
||||
}
|
||||
|
||||
if (decisions.length > 0) {
|
||||
|
||||
@@ -3,5 +3,6 @@ import { describeImageWithModel } from "../image.js";
|
||||
|
||||
/** Media-understanding provider registered under the id "anthropic"; declares image support only, handled by describeImageWithModel. */
export const anthropicProvider: MediaUnderstandingProvider = {
|
||||
id: "anthropic",
|
||||
capabilities: ["image"],
|
||||
describeImage: describeImageWithModel,
|
||||
};
|
||||
|
||||
@@ -4,6 +4,7 @@ import { describeGeminiVideo } from "./video.js";
|
||||
|
||||
/**
 * Media-understanding provider registered under the id "google".
 * Images go through describeImageWithModel, videos through describeGeminiVideo.
 * NOTE(review): "audio" is declared in capabilities but no transcribeAudio
 * handler is set on this object — confirm whether that is intentional.
 */
export const googleProvider: MediaUnderstandingProvider = {
|
||||
id: "google",
|
||||
capabilities: ["image", "audio", "video"],
|
||||
describeImage: describeImageWithModel,
|
||||
describeVideo: describeGeminiVideo,
|
||||
};
|
||||
|
||||
@@ -5,6 +5,7 @@ const DEFAULT_GROQ_AUDIO_BASE_URL = "https://api.groq.com/openai/v1";
|
||||
|
||||
export const groqProvider: MediaUnderstandingProvider = {
|
||||
id: "groq",
|
||||
capabilities: ["audio"],
|
||||
transcribeAudio: (req) =>
|
||||
transcribeOpenAiCompatibleAudio({
|
||||
...req,
|
||||
|
||||
@@ -29,7 +29,16 @@ export function buildMediaUnderstandingRegistry(
|
||||
}
|
||||
if (overrides) {
|
||||
for (const [key, provider] of Object.entries(overrides)) {
|
||||
registry.set(normalizeMediaProviderId(key), provider);
|
||||
const normalizedKey = normalizeMediaProviderId(key);
|
||||
const existing = registry.get(normalizedKey);
|
||||
const merged = existing
|
||||
? {
|
||||
...existing,
|
||||
...provider,
|
||||
capabilities: provider.capabilities ?? existing.capabilities,
|
||||
}
|
||||
: provider;
|
||||
registry.set(normalizedKey, merged);
|
||||
}
|
||||
}
|
||||
return registry;
|
||||
|
||||
@@ -3,5 +3,6 @@ import { describeImageWithModel } from "../image.js";
|
||||
|
||||
/** Media-understanding provider registered under the id "minimax"; declares image support only, handled by describeImageWithModel. */
export const minimaxProvider: MediaUnderstandingProvider = {
|
||||
id: "minimax",
|
||||
capabilities: ["image"],
|
||||
describeImage: describeImageWithModel,
|
||||
};
|
||||
|
||||
@@ -4,6 +4,7 @@ import { transcribeOpenAiCompatibleAudio } from "./audio.js";
|
||||
|
||||
/**
 * Media-understanding provider registered under the id "openai".
 * Images go through describeImageWithModel, audio through transcribeOpenAiCompatibleAudio.
 * NOTE(review): transcribeAudio is set but capabilities lists only "image" —
 * confirm whether "audio" is intentionally left out of the declared capabilities.
 */
export const openaiProvider: MediaUnderstandingProvider = {
|
||||
id: "openai",
|
||||
capabilities: ["image"],
|
||||
describeImage: describeImageWithModel,
|
||||
transcribeAudio: transcribeOpenAiCompatibleAudio,
|
||||
};
|
||||
|
||||
@@ -77,36 +77,22 @@ export function resolveScopeDecision(params: {
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Maps a provider id to the media capabilities hard-coded for it.
 *
 * The id is normalized with `normalizeMediaProviderId` first. A fresh array is
 * returned on every call.
 *
 * @returns the capability list, or undefined for a missing or unrecognized provider.
 */
export function inferProviderCapabilities(
  providerId?: string,
): MediaUnderstandingCapability[] | undefined {
  switch (normalizeMediaProviderId(providerId ?? "")) {
    case "openai":
    case "anthropic":
    case "minimax":
      return ["image"];
    case "google":
      return ["image", "audio", "video"];
    case "groq":
      return ["audio"];
    default:
      // Covers both an empty normalized id and providers not listed above.
      return undefined;
  }
}
|
||||
|
||||
function inferCapabilities(
|
||||
entry: MediaUnderstandingModelConfig,
|
||||
): MediaUnderstandingCapability[] | undefined {
|
||||
if ((entry.type ?? (entry.command ? "cli" : "provider")) === "cli") {
|
||||
return undefined;
|
||||
}
|
||||
return inferProviderCapabilities(entry.provider);
|
||||
/**
 * Looks up the capabilities of a model entry's provider in the registry.
 *
 * @returns undefined for CLI entries (explicit type "cli", or no type but a
 *   command), for entries whose provider id normalizes to an empty value, and
 *   for providers that are not registered or declare no capabilities.
 */
function resolveEntryCapabilities(params: {
  entry: MediaUnderstandingModelConfig;
  providerRegistry: Map<string, { capabilities?: MediaUnderstandingCapability[] }>;
}): MediaUnderstandingCapability[] | undefined {
  const { entry, providerRegistry } = params;

  const isCli = (entry.type ?? (entry.command ? "cli" : "provider")) === "cli";
  if (isCli) return undefined;

  const providerId = normalizeMediaProviderId(entry.provider ?? "");
  return providerId ? providerRegistry.get(providerId)?.capabilities : undefined;
}
|
||||
|
||||
export function resolveModelEntries(params: {
|
||||
cfg: ClawdbotConfig;
|
||||
capability: MediaUnderstandingCapability;
|
||||
config?: MediaUnderstandingConfig;
|
||||
providerRegistry: Map<string, { capabilities?: MediaUnderstandingCapability[] }>;
|
||||
}): MediaUnderstandingModelConfig[] {
|
||||
const { cfg, capability, config } = params;
|
||||
const sharedModels = cfg.tools?.media?.models ?? [];
|
||||
@@ -122,7 +108,7 @@ export function resolveModelEntries(params: {
|
||||
entry.capabilities && entry.capabilities.length > 0
|
||||
? entry.capabilities
|
||||
: source === "shared"
|
||||
? inferCapabilities(entry)
|
||||
? resolveEntryCapabilities({ entry, providerRegistry: params.providerRegistry })
|
||||
: undefined;
|
||||
if (!caps || caps.length === 0) {
|
||||
if (source === "shared") {
|
||||
@@ -148,13 +134,32 @@ export function resolveConcurrency(cfg: ClawdbotConfig): number {
|
||||
return DEFAULT_MEDIA_CONCURRENCY;
|
||||
}
|
||||
|
||||
export function resolveCapabilityEnabled(params: {
|
||||
export function resolveEntriesWithActiveFallback(params: {
|
||||
cfg: ClawdbotConfig;
|
||||
capability: MediaUnderstandingCapability;
|
||||
config?: MediaUnderstandingConfig;
|
||||
}): boolean {
|
||||
if (params.config?.enabled === false) return false;
|
||||
const sharedModels = params.cfg.tools?.media?.models ?? [];
|
||||
const hasModels = (params.config?.models?.length ?? 0) > 0 || sharedModels.length > 0;
|
||||
if (!hasModels) return false;
|
||||
return true;
|
||||
providerRegistry: Map<string, { capabilities?: MediaUnderstandingCapability[] }>;
|
||||
activeModel?: { provider: string; model?: string };
|
||||
}): MediaUnderstandingModelConfig[] {
|
||||
const entries = resolveModelEntries({
|
||||
cfg: params.cfg,
|
||||
capability: params.capability,
|
||||
config: params.config,
|
||||
providerRegistry: params.providerRegistry,
|
||||
});
|
||||
if (entries.length > 0) return entries;
|
||||
if (params.config?.enabled !== true) return entries;
|
||||
const activeProviderRaw = params.activeModel?.provider?.trim();
|
||||
if (!activeProviderRaw) return entries;
|
||||
const activeProvider = normalizeMediaProviderId(activeProviderRaw);
|
||||
if (!activeProvider) return entries;
|
||||
const capabilities = params.providerRegistry.get(activeProvider)?.capabilities;
|
||||
if (!capabilities || !capabilities.includes(params.capability)) return entries;
|
||||
return [
|
||||
{
|
||||
type: "provider",
|
||||
provider: activeProvider,
|
||||
model: params.activeModel?.model,
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
506
src/media-understanding/runner.ts
Normal file
506
src/media-understanding/runner.ts
Normal file
@@ -0,0 +1,506 @@
|
||||
import type { ClawdbotConfig } from "../config/config.js";
|
||||
import type { MsgContext } from "../auto-reply/templating.js";
|
||||
import { applyTemplate } from "../auto-reply/templating.js";
|
||||
import { resolveApiKeyForProvider } from "../agents/model-auth.js";
|
||||
import { logVerbose, shouldLogVerbose } from "../globals.js";
|
||||
import { runExec } from "../process/exec.js";
|
||||
import type {
|
||||
MediaUnderstandingConfig,
|
||||
MediaUnderstandingModelConfig,
|
||||
} from "../config/types.tools.js";
|
||||
import { MediaAttachmentCache, normalizeAttachments, selectAttachments } from "./attachments.js";
|
||||
import {
|
||||
CLI_OUTPUT_MAX_BUFFER,
|
||||
DEFAULT_AUDIO_MODELS,
|
||||
DEFAULT_TIMEOUT_SECONDS,
|
||||
} from "./defaults.js";
|
||||
import { isMediaUnderstandingSkipError, MediaUnderstandingSkipError } from "./errors.js";
|
||||
import {
|
||||
resolveEntriesWithActiveFallback,
|
||||
resolveMaxBytes,
|
||||
resolveMaxChars,
|
||||
resolvePrompt,
|
||||
resolveScopeDecision,
|
||||
resolveTimeoutMs,
|
||||
} from "./resolve.js";
|
||||
import type {
|
||||
MediaAttachment,
|
||||
MediaUnderstandingCapability,
|
||||
MediaUnderstandingDecision,
|
||||
MediaUnderstandingModelDecision,
|
||||
MediaUnderstandingOutput,
|
||||
MediaUnderstandingProvider,
|
||||
} from "./types.js";
|
||||
import {
|
||||
buildMediaUnderstandingRegistry,
|
||||
getMediaUnderstandingProvider,
|
||||
normalizeMediaProviderId,
|
||||
} from "./providers/index.js";
|
||||
import { describeImageWithModel } from "./providers/image.js";
|
||||
import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.js";
|
||||
|
||||
/**
 * Identifies the model that is currently active, passed to `runCapability`
 * as `activeModel` and forwarded to `resolveEntriesWithActiveFallback`.
 */
export type ActiveMediaModel = {
  /** Provider id of the active model. */
  provider: string;
  /** Optional model id within that provider. */
  model?: string;
};
|
||||
|
||||
/** Media understanding providers keyed by a string id. */
type ProviderRegistry = Map<string, MediaUnderstandingProvider>;
|
||||
|
||||
/** Result of `runCapability`: the produced outputs plus the decision record. */
export type RunCapabilityResult = {
  /** One output per attachment that was successfully processed. */
  outputs: MediaUnderstandingOutput[];
  /** What was attempted for the capability and how it turned out. */
  decision: MediaUnderstandingDecision;
};
|
||||
|
||||
/**
 * Builds the provider registry by delegating to `buildMediaUnderstandingRegistry`.
 *
 * @param overrides - Optional providers forwarded unchanged to the registry builder.
 * @returns The registry produced by the builder.
 */
export function buildProviderRegistry(
  overrides?: Record<string, MediaUnderstandingProvider>,
): ProviderRegistry {
  const registry = buildMediaUnderstandingRegistry(overrides);
  return registry;
}
|
||||
|
||||
/**
 * Extracts the media attachments from a message context by delegating to
 * `normalizeAttachments`.
 *
 * @param ctx - The message context to read attachments from.
 * @returns The attachment list returned by `normalizeAttachments`.
 */
export function normalizeMediaAttachments(ctx: MsgContext): MediaAttachment[] {
  const attachments = normalizeAttachments(ctx);
  return attachments;
}
|
||||
|
||||
/**
 * Creates a `MediaAttachmentCache` over the given attachments.
 *
 * @param attachments - Attachments the cache is constructed with.
 * @returns A new cache instance.
 */
export function createMediaAttachmentCache(attachments: MediaAttachment[]): MediaAttachmentCache {
  const cache = new MediaAttachmentCache(attachments);
  return cache;
}
|
||||
|
||||
/**
 * Trims surrounding whitespace and optionally caps the text length.
 *
 * @param text - Raw model or CLI output.
 * @param maxChars - Maximum number of UTF-16 code units to keep. A missing,
 *   zero, `NaN` or negative value means "no limit".
 * @returns The trimmed text, truncated to at most `maxChars` code units and
 *   trimmed again so the cut does not leave trailing whitespace.
 */
function trimOutput(text: string, maxChars?: number): string {
  const trimmed = text.trim();
  // A negative limit would make `slice` drop characters from the end, so it is
  // treated like the other "no limit" values.
  if (!maxChars || maxChars < 0 || trimmed.length <= maxChars) return trimmed;
  let end = Math.trunc(maxChars);
  // Do not cut between the two halves of a surrogate pair: if the last kept
  // code unit is a high surrogate, drop it as well.
  const lastKept = trimmed.charCodeAt(end - 1);
  if (end > 0 && lastKept >= 0xd800 && lastKept <= 0xdbff) {
    end -= 1;
  }
  return trimmed.slice(0, end).trim();
}
|
||||
|
||||
/**
 * Builds the decision record for one attempted model entry.
 *
 * For CLI entries the trimmed command is reported as the provider, falling
 * back to `"cli"` when the command is missing or blank, and the model defaults
 * to that command. For provider entries the normalized provider id is
 * reported, falling back to the trimmed raw id.
 *
 * @param params.entry - The model entry that was attempted.
 * @param params.entryType - Whether the entry ran as a provider or a CLI command.
 * @param params.outcome - Outcome to record for the attempt.
 * @param params.reason - Optional explanation for a skipped or failed attempt.
 * @returns The decision describing this attempt.
 */
function buildModelDecision(params: {
  entry: MediaUnderstandingModelConfig;
  entryType: "provider" | "cli";
  outcome: MediaUnderstandingModelDecision["outcome"];
  reason?: string;
}): MediaUnderstandingModelDecision {
  const { entry, entryType, outcome, reason } = params;
  if (entryType === "cli") {
    // A whitespace-only command trims to "", which `??` alone would not catch;
    // collapse it to undefined so the fallbacks below apply.
    const command = entry.command?.trim() || undefined;
    return {
      type: "cli",
      provider: command ?? "cli",
      model: entry.model ?? command,
      outcome,
      reason,
    };
  }
  const providerIdRaw = entry.provider?.trim();
  const providerId = providerIdRaw ? normalizeMediaProviderId(providerIdRaw) : undefined;
  return {
    type: "provider",
    provider: providerId ?? providerIdRaw,
    model: entry.model,
    outcome,
    reason,
  };
}
|
||||
|
||||
/**
 * Runs one provider-backed model entry against a single attachment.
 *
 * Limits, timeout and prompt are resolved from the entry first, then the
 * capability config, then `cfg.tools.media[capability]`.
 *
 * - `image`: requires `agentDir` and a model id; uses the registered
 *   provider's `describeImage` when present, otherwise `describeImageWithModel`.
 * - `audio`: requires a registered provider with `transcribeAudio`.
 * - otherwise (video): requires a registered provider with `describeVideo`,
 *   and skips the attachment when its estimated base64 size exceeds the limit.
 *
 * `params.ctx` is accepted but not read here.
 *
 * @returns The output for the attachment, with its text capped via `trimOutput`.
 * @throws Error when the entry has no provider, a prerequisite is missing, or
 *   the provider is not available for the capability.
 * @throws MediaUnderstandingSkipError when a video payload is too large.
 */
async function runProviderEntry(params: {
  capability: MediaUnderstandingCapability;
  entry: MediaUnderstandingModelConfig;
  cfg: ClawdbotConfig;
  ctx: MsgContext;
  attachmentIndex: number;
  cache: MediaAttachmentCache;
  agentDir?: string;
  providerRegistry: ProviderRegistry;
  config?: MediaUnderstandingConfig;
}): Promise<MediaUnderstandingOutput | null> {
  const { entry, capability, cfg, attachmentIndex, agentDir } = params;

  const rawProviderId = entry.provider?.trim();
  if (!rawProviderId) {
    throw new Error(`Provider entry missing provider for ${capability}`);
  }
  const providerId = normalizeMediaProviderId(rawProviderId);

  // Capability-wide settings from the global config: the last fallback level.
  const capabilityDefaults = cfg.tools?.media?.[capability];
  const maxBytes = resolveMaxBytes({ capability, entry, cfg, config: params.config });
  const maxChars = resolveMaxChars({ capability, entry, cfg, config: params.config });
  const timeoutMs = resolveTimeoutMs(
    entry.timeoutSeconds ?? params.config?.timeoutSeconds ?? capabilityDefaults?.timeoutSeconds,
    DEFAULT_TIMEOUT_SECONDS[capability],
  );
  const prompt = resolvePrompt(
    capability,
    entry.prompt ?? params.config?.prompt ?? capabilityDefaults?.prompt,
    maxChars,
  );

  // Shared lookups, invoked lazily so each branch keeps its validation order.
  const loadMedia = () => params.cache.getBuffer({ attachmentIndex, maxBytes, timeoutMs });
  const resolveKey = () =>
    resolveApiKeyForProvider({
      provider: providerId,
      cfg,
      profileId: entry.profile,
      preferredProfile: entry.preferredProfile,
      agentDir,
    });

  if (capability === "image") {
    if (!agentDir) {
      throw new Error("Image understanding requires agentDir");
    }
    const modelId = entry.model?.trim();
    if (!modelId) {
      throw new Error("Image understanding requires model id");
    }
    const media = await loadMedia();
    const imageProvider = getMediaUnderstandingProvider(providerId, params.providerRegistry);
    const request = {
      buffer: media.buffer,
      fileName: media.fileName,
      mime: media.mime,
      model: modelId,
      provider: providerId,
      prompt,
      timeoutMs,
      profile: entry.profile,
      preferredProfile: entry.preferredProfile,
      agentDir,
      cfg,
    };
    // Prefer the provider's own image hook; otherwise use the generic describer.
    const described = imageProvider?.describeImage
      ? await imageProvider.describeImage(request)
      : await describeImageWithModel(request);
    return {
      kind: "image.description",
      attachmentIndex,
      text: trimOutput(described.text, maxChars),
      provider: providerId,
      model: described.model ?? modelId,
    };
  }

  const provider = getMediaUnderstandingProvider(providerId, params.providerRegistry);
  if (!provider) {
    throw new Error(`Media provider not available: ${providerId}`);
  }

  if (capability === "audio") {
    if (!provider.transcribeAudio) {
      throw new Error(`Audio transcription provider "${providerId}" not available.`);
    }
    const media = await loadMedia();
    const { apiKey } = await resolveKey();
    const providerConfig = cfg.models?.providers?.[providerId];
    const model = entry.model?.trim() || DEFAULT_AUDIO_MODELS[providerId] || entry.model;
    const transcript = await provider.transcribeAudio({
      buffer: media.buffer,
      fileName: media.fileName,
      mime: media.mime,
      apiKey,
      baseUrl: providerConfig?.baseUrl,
      headers: providerConfig?.headers,
      model,
      language: entry.language ?? params.config?.language ?? cfg.tools?.media?.audio?.language,
      prompt,
      timeoutMs,
    });
    return {
      kind: "audio.transcription",
      attachmentIndex,
      text: trimOutput(transcript.text, maxChars),
      provider: providerId,
      model: transcript.model ?? model,
    };
  }

  if (!provider.describeVideo) {
    throw new Error(`Video understanding provider "${providerId}" not available.`);
  }
  const media = await loadMedia();
  // Check the estimated base64 payload size before resolving credentials.
  const estimatedBase64Bytes = estimateBase64Size(media.size);
  const maxBase64Bytes = resolveVideoMaxBase64Bytes(maxBytes);
  if (estimatedBase64Bytes > maxBase64Bytes) {
    throw new MediaUnderstandingSkipError(
      "maxBytes",
      `Video attachment ${params.attachmentIndex + 1} base64 payload ${estimatedBase64Bytes} exceeds ${maxBase64Bytes}`,
    );
  }
  const { apiKey } = await resolveKey();
  const providerConfig = cfg.models?.providers?.[providerId];
  const description = await provider.describeVideo({
    buffer: media.buffer,
    fileName: media.fileName,
    mime: media.mime,
    apiKey,
    baseUrl: providerConfig?.baseUrl,
    headers: providerConfig?.headers,
    model: entry.model,
    prompt,
    timeoutMs,
  });
  return {
    kind: "video.description",
    attachmentIndex,
    text: trimOutput(description.text, maxChars),
    provider: providerId,
    model: description.model ?? entry.model,
  };
}
|
||||
|
||||
/**
 * Runs one CLI-backed model entry against a single attachment.
 *
 * The attachment is obtained as a file path from the cache, the entry's
 * arguments are passed through `applyTemplate` with `MediaPath`, `Prompt` and
 * `MaxChars` added to the message context (the command itself is not
 * templated), and the command's stdout becomes the output text.
 *
 * @returns The output built from stdout, or `null` when stdout is empty after
 *   trimming.
 * @throws Error when the entry has no command.
 */
async function runCliEntry(params: {
  capability: MediaUnderstandingCapability;
  entry: MediaUnderstandingModelConfig;
  cfg: ClawdbotConfig;
  ctx: MsgContext;
  attachmentIndex: number;
  cache: MediaAttachmentCache;
  config?: MediaUnderstandingConfig;
}): Promise<MediaUnderstandingOutput | null> {
  const { entry, capability, cfg, ctx, attachmentIndex } = params;

  const command = entry.command?.trim();
  if (!command) {
    throw new Error(`CLI entry missing command for ${capability}`);
  }
  const rawArgs = entry.args ?? [];

  // Capability-wide settings from the global config: the last fallback level.
  const capabilityDefaults = cfg.tools?.media?.[capability];
  const maxBytes = resolveMaxBytes({ capability, entry, cfg, config: params.config });
  const maxChars = resolveMaxChars({ capability, entry, cfg, config: params.config });
  const timeoutMs = resolveTimeoutMs(
    entry.timeoutSeconds ?? params.config?.timeoutSeconds ?? capabilityDefaults?.timeoutSeconds,
    DEFAULT_TIMEOUT_SECONDS[capability],
  );
  const prompt = resolvePrompt(
    capability,
    entry.prompt ?? params.config?.prompt ?? capabilityDefaults?.prompt,
    maxChars,
  );

  const { path: mediaPath } = await params.cache.getPath({ attachmentIndex, maxBytes, timeoutMs });

  const templCtx: MsgContext = {
    ...ctx,
    MediaPath: mediaPath,
    Prompt: prompt,
    MaxChars: maxChars,
  };
  // Only the arguments are templated; the executable name is used verbatim.
  const renderedArgs = rawArgs.map((arg) => applyTemplate(arg, templCtx));

  if (shouldLogVerbose()) {
    logVerbose(`Media understanding via CLI: ${[command, ...renderedArgs].join(" ")}`);
  }

  const { stdout } = await runExec(command, renderedArgs, {
    timeoutMs,
    maxBuffer: CLI_OUTPUT_MAX_BUFFER,
  });

  const text = trimOutput(stdout, maxChars);
  if (!text) {
    return null;
  }
  return {
    kind: capability === "audio" ? "audio.transcription" : `${capability}.description`,
    attachmentIndex,
    text,
    provider: "cli",
    model: command,
  };
}
|
||||
|
||||
/**
 * Tries the model entries in order for one attachment until one produces output.
 *
 * Every attempt is recorded: `success` for the entry that produced output,
 * `skipped` for empty output or a `MediaUnderstandingSkipError`, and `failed`
 * for any other thrown error. Errors never propagate; the next entry is tried.
 *
 * @returns The first non-null output (or `null` if none) and the attempt log.
 */
async function runAttachmentEntries(params: {
  capability: MediaUnderstandingCapability;
  cfg: ClawdbotConfig;
  ctx: MsgContext;
  attachmentIndex: number;
  agentDir?: string;
  providerRegistry: ProviderRegistry;
  cache: MediaAttachmentCache;
  entries: MediaUnderstandingModelConfig[];
  config?: MediaUnderstandingConfig;
}): Promise<{ output: MediaUnderstandingOutput | null; attempts: MediaUnderstandingModelDecision[] }> {
  const { entries, capability } = params;
  const attempts: MediaUnderstandingModelDecision[] = [];

  for (const entry of entries) {
    // An entry without an explicit type is a CLI entry when it has a command.
    const entryType = entry.type ?? (entry.command ? "cli" : "provider");
    const record = (outcome: MediaUnderstandingModelDecision["outcome"], reason?: string) => {
      attempts.push(buildModelDecision({ entry, entryType, outcome, reason }));
    };
    const shared = {
      capability,
      entry,
      cfg: params.cfg,
      ctx: params.ctx,
      attachmentIndex: params.attachmentIndex,
      cache: params.cache,
      config: params.config,
    };

    try {
      let result: MediaUnderstandingOutput | null;
      if (entryType === "cli") {
        result = await runCliEntry(shared);
      } else {
        result = await runProviderEntry({
          ...shared,
          agentDir: params.agentDir,
          providerRegistry: params.providerRegistry,
        });
      }

      if (!result) {
        record("skipped", "empty output");
        continue;
      }

      const decision = buildModelDecision({ entry, entryType, outcome: "success" });
      // Report the provider and model that actually answered, when known.
      if (result.provider) decision.provider = result.provider;
      if (result.model) decision.model = result.model;
      attempts.push(decision);
      return { output: result, attempts };
    } catch (err) {
      if (isMediaUnderstandingSkipError(err)) {
        record("skipped", `${err.reason}: ${err.message}`);
        if (shouldLogVerbose()) {
          logVerbose(`Skipping ${capability} model due to ${err.reason}: ${err.message}`);
        }
        continue;
      }
      const failure = String(err);
      record("failed", failure);
      if (shouldLogVerbose()) {
        logVerbose(`${capability} understanding failed: ${failure}`);
      }
    }
  }

  return { output: null, attempts };
}
|
||||
|
||||
/**
 * Runs one media understanding capability over the selected attachments.
 *
 * Checks are applied in this order, each returning early with empty outputs:
 * 1. `enabled === false` on the config gives outcome `disabled`.
 * 2. No attachment selected gives outcome `no-attachment`.
 * 3. Scope policy resolves to `deny` gives outcome `scope-deny`.
 * 4. No model entries resolved gives outcome `skipped`.
 *
 * Otherwise attachments are processed one after another, and the outcome is
 * `success` when at least one produced output, else `skipped`.
 *
 * @param params.config - Capability config; defaults to `cfg.tools.media[capability]`.
 * @returns The collected outputs and the decision record.
 */
export async function runCapability(params: {
  capability: MediaUnderstandingCapability;
  cfg: ClawdbotConfig;
  ctx: MsgContext;
  attachments: MediaAttachmentCache;
  media: MediaAttachment[];
  agentDir?: string;
  providerRegistry: ProviderRegistry;
  config?: MediaUnderstandingConfig;
  activeModel?: ActiveMediaModel;
}): Promise<RunCapabilityResult> {
  const { capability, cfg, ctx } = params;
  const config = params.config ?? cfg.tools?.media?.[capability];

  if (config?.enabled === false) {
    return { outputs: [], decision: { capability, outcome: "disabled", attachments: [] } };
  }

  const selected = selectAttachments({
    capability,
    attachments: params.media,
    policy: config?.attachments,
  });
  if (selected.length === 0) {
    return { outputs: [], decision: { capability, outcome: "no-attachment", attachments: [] } };
  }

  // Decision rows for selected attachments that were never attempted.
  const untriedAttachments = (): MediaUnderstandingDecision["attachments"] =>
    selected.map((item) => ({ attachmentIndex: item.index, attempts: [] }));

  if (resolveScopeDecision({ scope: config?.scope, ctx }) === "deny") {
    if (shouldLogVerbose()) {
      logVerbose(`${capability} understanding disabled by scope policy.`);
    }
    return {
      outputs: [],
      decision: { capability, outcome: "scope-deny", attachments: untriedAttachments() },
    };
  }

  const entries = resolveEntriesWithActiveFallback({
    cfg,
    capability,
    config,
    providerRegistry: params.providerRegistry,
    activeModel: params.activeModel,
  });
  if (entries.length === 0) {
    return {
      outputs: [],
      decision: { capability, outcome: "skipped", attachments: untriedAttachments() },
    };
  }

  const outputs: MediaUnderstandingOutput[] = [];
  const attachmentDecisions: MediaUnderstandingDecision["attachments"] = [];
  for (const { index } of selected) {
    const outcome = await runAttachmentEntries({
      capability,
      cfg,
      ctx,
      attachmentIndex: index,
      agentDir: params.agentDir,
      providerRegistry: params.providerRegistry,
      cache: params.attachments,
      entries,
      config,
    });
    if (outcome.output) {
      outputs.push(outcome.output);
    }
    attachmentDecisions.push({
      attachmentIndex: index,
      attempts: outcome.attempts,
      chosen: outcome.attempts.find((attempt) => attempt.outcome === "success"),
    });
  }

  return {
    outputs,
    decision: {
      capability,
      outcome: outputs.length > 0 ? "success" : "skipped",
      attachments: attachmentDecisions,
    },
  };
}
|
||||
@@ -106,6 +106,7 @@ export type ImageDescriptionResult = {
|
||||
|
||||
export type MediaUnderstandingProvider = {
|
||||
id: string;
|
||||
capabilities?: MediaUnderstandingCapability[];
|
||||
transcribeAudio?: (req: AudioTranscriptionRequest) => Promise<AudioTranscriptionResult>;
|
||||
describeVideo?: (req: VideoDescriptionRequest) => Promise<VideoDescriptionResult>;
|
||||
describeImage?: (req: ImageDescriptionRequest) => Promise<ImageDescriptionResult>;
|
||||
|
||||
Reference in New Issue
Block a user