diff --git a/docs/nodes/media-understanding.md b/docs/nodes/media-understanding.md
index 12f4c2317..b9d2abdc1 100644
--- a/docs/nodes/media-understanding.md
+++ b/docs/nodes/media-understanding.md
@@ -21,7 +21,8 @@ Clawdbot can optionally **summarize inbound media** (image/audio/video) before t
 4) If a model fails or the media is too large, **fall back to the next entry**.
 5) On success:
    - `Body` becomes `[Image]`, `[Audio]`, or `[Video]` block.
-   - Audio sets `{{Transcript}}` and `CommandBody`/`RawBody` for command parsing.
+   - Audio sets `{{Transcript}}`; command parsing uses the caption text when present,
+     otherwise the transcript.
    - Captions are preserved as `User text:` inside the block.
 
 If understanding fails or is disabled, **the reply flow continues** with the original body + attachments.
@@ -98,6 +99,8 @@ Rules:
 - If media exceeds `maxBytes`, that model is skipped and the **next model is tried**.
 - If the model returns more than `maxChars`, output is trimmed.
 - `prompt` defaults to simple “Describe the {media}.” plus the `maxChars` guidance (image/video only).
+- If `.enabled: true` but no models are configured, Clawdbot tries the
+  **active reply model** when its provider supports the capability.
 
 ## Capabilities (optional)
 If you set `capabilities`, the entry only runs for those media types. For shared
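A minimal config that exercises the new fallback rule might look like the sketch below. The shape mirrors the `ClawdbotConfig` literals used in the `apply.test.ts` cases later in this diff; treat the exact nesting as inferred from those tests rather than documented API.

```ts
// Sketch, assuming the config shape used in apply.test.ts below.
// Audio understanding is enabled but no `models` are listed, so Clawdbot
// should fall back to the active reply model when its provider supports audio.
const cfg: ClawdbotConfig = {
  tools: {
    media: {
      audio: {
        enabled: true, // no `models`: the active reply model is tried instead
      },
    },
  },
};
```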
diff --git a/src/auto-reply/reply/get-reply.ts b/src/auto-reply/reply/get-reply.ts
index 648f5da6a..80bbea0cb 100644
--- a/src/auto-reply/reply/get-reply.ts
+++ b/src/auto-reply/reply/get-reply.ts
@@ -81,6 +81,7 @@ export async function getReplyFromConfig(
     ctx,
     cfg,
     agentDir,
+    activeModel: { provider, model },
   });
 
   const commandAuthorized = ctx.CommandAuthorized ?? true;
diff --git a/src/auto-reply/templating.ts b/src/auto-reply/templating.ts
index e3e7f65b6..0e0bc40b8 100644
--- a/src/auto-reply/templating.ts
+++ b/src/auto-reply/templating.ts
@@ -1,7 +1,10 @@
 import type { ChannelId } from "../channels/plugins/types.js";
 import type { InternalMessageChannel } from "../utils/message-channel.js";
 import type { CommandArgs } from "./commands-registry.types.js";
-import type { MediaUnderstandingOutput } from "../media-understanding/types.js";
+import type {
+  MediaUnderstandingDecision,
+  MediaUnderstandingOutput,
+} from "../media-understanding/types.js";
 
 /** Valid message channels for routing. */
 export type OriginatingChannelType = ChannelId | InternalMessageChannel;
@@ -53,6 +56,7 @@ export type MsgContext = {
   MediaRemoteHost?: string;
   Transcript?: string;
   MediaUnderstanding?: MediaUnderstandingOutput[];
+  MediaUnderstandingDecisions?: MediaUnderstandingDecision[];
   Prompt?: string;
   MaxChars?: number;
   ChatType?: string;
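This diff does not include the `src/media-understanding/types.ts` hunk that defines the new decision types. From the way `apply.ts` builds and consumes them below, they plausibly look like the following sketch (field names inferred from usage, not authoritative):

```ts
// Hypothetical reconstruction of the new exports in src/media-understanding/types.ts,
// inferred from buildModelDecision and runCapability in apply.ts below.
export type MediaUnderstandingModelDecision = {
  type: "provider" | "cli";
  provider?: string;
  model?: string;
  outcome: "success" | "skipped" | "failed";
  reason?: string;
};

export type MediaUnderstandingDecision = {
  capability: MediaUnderstandingCapability; // "image" | "audio" | "video"
  outcome: "success" | "skipped" | "disabled" | "no-attachment" | "scope-deny";
  attachments: Array<{
    attachmentIndex: number;
    attempts: MediaUnderstandingModelDecision[];
    chosen?: MediaUnderstandingModelDecision;
  }>;
};
```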
diff --git a/src/media-understanding/apply.test.ts b/src/media-understanding/apply.test.ts
index 47cd69598..d2ce7141e 100644
--- a/src/media-understanding/apply.test.ts
+++ b/src/media-understanding/apply.test.ts
@@ -85,6 +85,50 @@ describe("applyMediaUnderstanding", () => {
     expect(ctx.BodyForCommands).toBe("transcribed text");
   });
 
+  it("keeps caption for command parsing when audio has user text", async () => {
+    const { applyMediaUnderstanding } = await loadApply();
+    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
+    const audioPath = path.join(dir, "note.ogg");
+    await fs.writeFile(audioPath, "hello");
+
+    const ctx: MsgContext = {
+      Body: " /capture status",
+      MediaPath: audioPath,
+      MediaType: "audio/ogg",
+    };
+    const cfg: ClawdbotConfig = {
+      tools: {
+        media: {
+          audio: {
+            enabled: true,
+            maxBytes: 1024 * 1024,
+            models: [{ provider: "groq" }],
+          },
+        },
+      },
+    };
+
+    const result = await applyMediaUnderstanding({
+      ctx,
+      cfg,
+      providers: {
+        groq: {
+          id: "groq",
+          transcribeAudio: async () => ({ text: "transcribed text" }),
+        },
+      },
+    });
+
+    expect(result.appliedAudio).toBe(true);
+    expect(ctx.Transcript).toBe("transcribed text");
+    expect(ctx.Body).toBe(
+      "[Audio]\nUser text:\n/capture status\nTranscript:\ntranscribed text",
+    );
+    expect(ctx.CommandBody).toBe("/capture status");
+    expect(ctx.RawBody).toBe("/capture status");
+    expect(ctx.BodyForCommands).toBe("/capture status");
+  });
+
   it("handles URL-only attachments for audio transcription", async () => {
     const { applyMediaUnderstanding } = await loadApply();
     const ctx: MsgContext = {
@@ -301,6 +345,43 @@ describe("applyMediaUnderstanding", () => {
     expect(ctx.Body).toBe("[Image]\nDescription:\nshared description");
   });
 
+  it("uses active model when enabled and models are missing", async () => {
+    const { applyMediaUnderstanding } = await loadApply();
+    const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
+    const audioPath = path.join(dir, "fallback.ogg");
+    await fs.writeFile(audioPath, "hello");
+
+    const ctx: MsgContext = {
+      Body: "",
+      MediaPath: audioPath,
+      MediaType: "audio/ogg",
+    };
+    const cfg: ClawdbotConfig = {
+      tools: {
+        media: {
+          audio: {
+            enabled: true,
+          },
+        },
+      },
+    };
+
+    const result = await applyMediaUnderstanding({
+      ctx,
+      cfg,
+      activeModel: { provider: "groq", model: "whisper-large-v3" },
+      providers: {
+        groq: {
+          id: "groq",
+          transcribeAudio: async () => ({ text: "fallback transcript" }),
+        },
+      },
+    });
+
+    expect(result.appliedAudio).toBe(true);
+    expect(ctx.Transcript).toBe("fallback transcript");
+  });
+
   it("handles multiple audio attachments when attachment mode is all", async () => {
     const { applyMediaUnderstanding } = await loadApply();
     const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
"cli" : "provider"); @@ -328,21 +390,46 @@ async function runAttachmentEntries(params: { providerRegistry: params.providerRegistry, config: params.config, }); - if (result) return result; + if (result) { + const decision = buildModelDecision({ entry, entryType, outcome: "success" }); + if (result.provider) decision.provider = result.provider; + if (result.model) decision.model = result.model; + attempts.push(decision); + return { output: result, attempts }; + } + attempts.push( + buildModelDecision({ entry, entryType, outcome: "skipped", reason: "empty output" }), + ); } catch (err) { if (isMediaUnderstandingSkipError(err)) { + attempts.push( + buildModelDecision({ + entry, + entryType: entry.type ?? (entry.command ? "cli" : "provider"), + outcome: "skipped", + reason: `${err.reason}: ${err.message}`, + }), + ); if (shouldLogVerbose()) { logVerbose(`Skipping ${capability} model due to ${err.reason}: ${err.message}`); } continue; } + attempts.push( + buildModelDecision({ + entry, + entryType: entry.type ?? (entry.command ? "cli" : "provider"), + outcome: "failed", + reason: String(err), + }), + ); if (shouldLogVerbose()) { logVerbose(`${capability} understanding failed: ${String(err)}`); } } } - return null; + return { output: null, attempts }; } async function runCapability(params: { @@ -350,33 +437,74 @@ async function runCapability(params: { cfg: ClawdbotConfig; ctx: MsgContext; attachments: MediaAttachmentCache; - attachmentIds: number[]; + media: ReturnType; agentDir?: string; providerRegistry: Map; config?: MediaUnderstandingConfig; -}): Promise { + activeModel?: ActiveMediaModel; +}): Promise<{ outputs: MediaUnderstandingOutput[]; decision: MediaUnderstandingDecision }> { const { capability, cfg, ctx } = params; const config = params.config ?? 
@@ -350,33 +437,74 @@
   cfg: ClawdbotConfig;
   ctx: MsgContext;
   attachments: MediaAttachmentCache;
-  attachmentIds: number[];
+  media: ReturnType;
   agentDir?: string;
   providerRegistry: Map<string, MediaUnderstandingProvider>;
   config?: MediaUnderstandingConfig;
-}): Promise<MediaUnderstandingOutput[]> {
+  activeModel?: ActiveMediaModel;
+}): Promise<{ outputs: MediaUnderstandingOutput[]; decision: MediaUnderstandingDecision }> {
   const { capability, cfg, ctx } = params;
   const config = params.config ?? resolveCapabilityConfig(cfg, capability);
-  if (!resolveCapabilityEnabled({ cfg, config })) return [];
+  if (config?.enabled === false) {
+    return {
+      outputs: [],
+      decision: { capability, outcome: "disabled", attachments: [] },
+    };
+  }
 
-  const entries = resolveModelEntries({ cfg, capability, config });
-  if (entries.length === 0) return [];
+  const attachmentPolicy = config?.attachments;
+  const selected = selectAttachments({
+    capability,
+    attachments: params.media,
+    policy: attachmentPolicy,
+  });
+  if (selected.length === 0) {
+    return {
+      outputs: [],
+      decision: { capability, outcome: "no-attachment", attachments: [] },
+    };
+  }
 
   const scopeDecision = resolveScopeDecision({ scope: config?.scope, ctx });
   if (scopeDecision === "deny") {
     if (shouldLogVerbose()) {
       logVerbose(`${capability} understanding disabled by scope policy.`);
     }
-    return [];
+    return {
+      outputs: [],
+      decision: {
+        capability,
+        outcome: "scope-deny",
+        attachments: selected.map((item) => ({ attachmentIndex: item.index, attempts: [] })),
+      },
+    };
+  }
+
+  const entries = resolveEntriesWithActiveFallback({
+    cfg,
+    capability,
+    config,
+    activeModel: params.activeModel,
+  });
+  if (entries.length === 0) {
+    return {
+      outputs: [],
+      decision: {
+        capability,
+        outcome: "skipped",
+        attachments: selected.map((item) => ({ attachmentIndex: item.index, attempts: [] })),
+      },
+    };
   }
 
   const outputs: MediaUnderstandingOutput[] = [];
-  for (const attachmentIndex of params.attachmentIds) {
-    const output = await runAttachmentEntries({
+  const attachmentDecisions: MediaUnderstandingDecision["attachments"] = [];
+  for (const attachment of selected) {
+    const { output, attempts } = await runAttachmentEntries({
       capability,
       cfg,
       ctx,
-      attachmentIndex,
+      attachmentIndex: attachment.index,
       agentDir: params.agentDir,
       providerRegistry: params.providerRegistry,
       cache: params.attachments,
"success" : "skipped", + attachments: attachmentDecisions, + }, + }; } export async function applyMediaUnderstanding(params: { @@ -393,6 +533,7 @@ export async function applyMediaUnderstanding(params: { cfg: ClawdbotConfig; agentDir?: string; providers?: Record; + activeModel?: ActiveMediaModel; }): Promise { const { ctx, cfg } = params; const commandCandidates = [ctx.CommandBody, ctx.RawBody, ctx.Body]; @@ -408,33 +549,40 @@ export async function applyMediaUnderstanding(params: { try { const tasks = CAPABILITY_ORDER.map((capability) => async () => { const config = resolveCapabilityConfig(cfg, capability); - const attachmentPolicy = config?.attachments; - const selected = selectAttachments({ - capability, - attachments, - policy: attachmentPolicy, - }); - if (selected.length === 0) return [] as MediaUnderstandingOutput[]; return await runCapability({ capability, cfg, ctx, attachments: cache, - attachmentIds: selected.map((item) => item.index), + media: attachments, agentDir: params.agentDir, providerRegistry, config, + activeModel: params.activeModel, }); }); const results = await runWithConcurrency(tasks, resolveConcurrency(cfg)); const outputs: MediaUnderstandingOutput[] = []; + const decisions: MediaUnderstandingDecision[] = []; for (const [index] of CAPABILITY_ORDER.entries()) { - const entries = results[index] ?? []; - if (!Array.isArray(entries)) continue; - for (const entry of entries) { - outputs.push(entry); + const entry = results[index]; + if (!entry) continue; + if (Array.isArray(entry.outputs)) { + for (const output of entry.outputs) { + outputs.push(output); + } } + if (entry.decision) { + decisions.push(entry.decision); + } + } + + if (decisions.length > 0) { + ctx.MediaUnderstandingDecisions = [ + ...(ctx.MediaUnderstandingDecisions ?? []), + ...decisions, + ]; } if (outputs.length > 0) { @@ -443,8 +591,13 @@ export async function applyMediaUnderstanding(params: { if (audioOutputs.length > 0) { const transcript = formatAudioTranscripts(audioOutputs); ctx.Transcript = transcript; - ctx.CommandBody = transcript; - ctx.RawBody = transcript; + if (originalUserText) { + ctx.CommandBody = originalUserText; + ctx.RawBody = originalUserText; + } else { + ctx.CommandBody = transcript; + ctx.RawBody = transcript; + } } else if (originalUserText) { ctx.CommandBody = originalUserText; ctx.RawBody = originalUserText; @@ -455,6 +608,7 @@ export async function applyMediaUnderstanding(params: { return { outputs, + decisions, appliedImage: outputs.some((output) => output.kind === "image.description"), appliedAudio: outputs.some((output) => output.kind === "audio.transcription"), appliedVideo: outputs.some((output) => output.kind === "video.description"), diff --git a/src/media-understanding/resolve.ts b/src/media-understanding/resolve.ts index a65a17044..bc34ce7d5 100644 --- a/src/media-understanding/resolve.ts +++ b/src/media-understanding/resolve.ts @@ -77,13 +77,10 @@ export function resolveScopeDecision(params: { }); } -function inferCapabilities( - entry: MediaUnderstandingModelConfig, +export function inferProviderCapabilities( + providerId?: string, ): MediaUnderstandingCapability[] | undefined { - if ((entry.type ?? (entry.command ? "cli" : "provider")) === "cli") { - return ["image", "audio", "video"]; - } - const provider = normalizeMediaProviderId(entry.provider ?? ""); + const provider = normalizeMediaProviderId(providerId ?? 
""); if (!provider) return undefined; if (provider === "openai" || provider === "anthropic" || provider === "minimax") { return ["image"]; @@ -97,6 +94,15 @@ function inferCapabilities( return undefined; } +function inferCapabilities( + entry: MediaUnderstandingModelConfig, +): MediaUnderstandingCapability[] | undefined { + if ((entry.type ?? (entry.command ? "cli" : "provider")) === "cli") { + return undefined; + } + return inferProviderCapabilities(entry.provider); +} + export function resolveModelEntries(params: { cfg: ClawdbotConfig; capability: MediaUnderstandingCapability;