refactor: tune media understanding
This commit is contained in:
@@ -21,7 +21,8 @@ Clawdbot can optionally **summarize inbound media** (image/audio/video) before t
|
|||||||
4) If a model fails or the media is too large, **fall back to the next entry**.
|
4) If a model fails or the media is too large, **fall back to the next entry**.
|
||||||
5) On success:
|
5) On success:
|
||||||
- `Body` becomes an `[Image]`, `[Audio]`, or `[Video]` block.
|
- `Body` becomes an `[Image]`, `[Audio]`, or `[Video]` block.
|
||||||
- Audio sets `{{Transcript}}` and `CommandBody`/`RawBody` for command parsing.
|
- Audio sets `{{Transcript}}`; command parsing uses caption text when present,
|
||||||
|
otherwise the transcript.
|
||||||
- Captions are preserved as `User text:` inside the block.
|
- Captions are preserved as `User text:` inside the block.
|
||||||
|
|
||||||
If understanding fails or is disabled, **the reply flow continues** with the original body + attachments.
|
If understanding fails or is disabled, **the reply flow continues** with the original body + attachments.
|
||||||
@@ -98,6 +99,8 @@ Rules:
|
|||||||
- If media exceeds `maxBytes`, that model is skipped and the **next model is tried**.
|
- If media exceeds `maxBytes`, that model is skipped and the **next model is tried**.
|
||||||
- If the model returns more than `maxChars`, output is trimmed.
|
- If the model returns more than `maxChars`, output is trimmed.
|
||||||
- `prompt` defaults to a simple “Describe the {media}.” plus the `maxChars` guidance (image/video only).
|
- `prompt` defaults to a simple “Describe the {media}.” plus the `maxChars` guidance (image/video only).
|
||||||
|
- If `<capability>.enabled: true` but no models are configured, Clawdbot tries the
|
||||||
|
**active reply model** when its provider supports the capability.
|
||||||
|
|
||||||
## Capabilities (optional)
|
## Capabilities (optional)
|
||||||
If you set `capabilities`, the entry only runs for those media types. For shared
|
If you set `capabilities`, the entry only runs for those media types. For shared
|
||||||
|
|||||||
@@ -81,6 +81,7 @@ export async function getReplyFromConfig(
|
|||||||
ctx,
|
ctx,
|
||||||
cfg,
|
cfg,
|
||||||
agentDir,
|
agentDir,
|
||||||
|
activeModel: { provider, model },
|
||||||
});
|
});
|
||||||
|
|
||||||
const commandAuthorized = ctx.CommandAuthorized ?? true;
|
const commandAuthorized = ctx.CommandAuthorized ?? true;
|
||||||
|
|||||||
@@ -1,7 +1,10 @@
|
|||||||
import type { ChannelId } from "../channels/plugins/types.js";
|
import type { ChannelId } from "../channels/plugins/types.js";
|
||||||
import type { InternalMessageChannel } from "../utils/message-channel.js";
|
import type { InternalMessageChannel } from "../utils/message-channel.js";
|
||||||
import type { CommandArgs } from "./commands-registry.types.js";
|
import type { CommandArgs } from "./commands-registry.types.js";
|
||||||
import type { MediaUnderstandingOutput } from "../media-understanding/types.js";
|
import type {
|
||||||
|
MediaUnderstandingDecision,
|
||||||
|
MediaUnderstandingOutput,
|
||||||
|
} from "../media-understanding/types.js";
|
||||||
|
|
||||||
/** Valid message channels for routing. */
|
/** Valid message channels for routing. */
|
||||||
export type OriginatingChannelType = ChannelId | InternalMessageChannel;
|
export type OriginatingChannelType = ChannelId | InternalMessageChannel;
|
||||||
@@ -53,6 +56,7 @@ export type MsgContext = {
|
|||||||
MediaRemoteHost?: string;
|
MediaRemoteHost?: string;
|
||||||
Transcript?: string;
|
Transcript?: string;
|
||||||
MediaUnderstanding?: MediaUnderstandingOutput[];
|
MediaUnderstanding?: MediaUnderstandingOutput[];
|
||||||
|
MediaUnderstandingDecisions?: MediaUnderstandingDecision[];
|
||||||
Prompt?: string;
|
Prompt?: string;
|
||||||
MaxChars?: number;
|
MaxChars?: number;
|
||||||
ChatType?: string;
|
ChatType?: string;
|
||||||
|
|||||||
@@ -85,6 +85,50 @@ describe("applyMediaUnderstanding", () => {
|
|||||||
expect(ctx.BodyForCommands).toBe("transcribed text");
|
expect(ctx.BodyForCommands).toBe("transcribed text");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("keeps caption for command parsing when audio has user text", async () => {
  // Scenario: an audio attachment arrives with a typed caption that is a
  // slash command. The transcript is still recorded, but every command-facing
  // field must carry the caption rather than the transcribed speech.
  const { applyMediaUnderstanding } = await loadApply();

  // Write a tiny placeholder file so the attachment has a real path on disk.
  const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
  const notePath = path.join(tmpDir, "note.ogg");
  await fs.writeFile(notePath, "hello");

  const ctx: MsgContext = {
    Body: "<media:audio> /capture status",
    MediaPath: notePath,
    MediaType: "audio/ogg",
  };
  const cfg: ClawdbotConfig = {
    tools: {
      media: {
        audio: {
          enabled: true,
          maxBytes: 1024 * 1024,
          models: [{ provider: "groq" }],
        },
      },
    },
  };

  // Stubbed provider: always returns the same transcript.
  const transcribeAudio = async () => ({ text: "transcribed text" });
  const result = await applyMediaUnderstanding({
    ctx,
    cfg,
    providers: { groq: { id: "groq", transcribeAudio } },
  });

  // The audio pass ran and the transcript was stored on the context...
  expect(result.appliedAudio).toBe(true);
  expect(ctx.Transcript).toBe("transcribed text");
  // ...the body becomes an [Audio] block holding both caption and transcript...
  expect(ctx.Body).toBe(
    "[Audio]\nUser text:\n/capture status\nTranscript:\ntranscribed text",
  );
  // ...while command parsing keeps using the caption.
  expect(ctx.CommandBody).toBe("/capture status");
  expect(ctx.RawBody).toBe("/capture status");
  expect(ctx.BodyForCommands).toBe("/capture status");
});
|
||||||
|
|
||||||
it("handles URL-only attachments for audio transcription", async () => {
|
it("handles URL-only attachments for audio transcription", async () => {
|
||||||
const { applyMediaUnderstanding } = await loadApply();
|
const { applyMediaUnderstanding } = await loadApply();
|
||||||
const ctx: MsgContext = {
|
const ctx: MsgContext = {
|
||||||
@@ -301,6 +345,43 @@ describe("applyMediaUnderstanding", () => {
|
|||||||
expect(ctx.Body).toBe("[Image]\nDescription:\nshared description");
|
expect(ctx.Body).toBe("[Image]\nDescription:\nshared description");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("uses active model when enabled and models are missing", async () => {
  // Scenario: audio understanding is switched on but no models are listed.
  // The active reply model passed by the caller is expected to be used.
  const { applyMediaUnderstanding } = await loadApply();

  // Write a tiny placeholder file so the attachment has a real path on disk.
  const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
  const clipPath = path.join(tmpDir, "fallback.ogg");
  await fs.writeFile(clipPath, "hello");

  const ctx: MsgContext = {
    Body: "<media:audio>",
    MediaPath: clipPath,
    MediaType: "audio/ogg",
  };
  // Deliberately no `models` array: only the enabled flag is set.
  const cfg: ClawdbotConfig = {
    tools: {
      media: {
        audio: {
          enabled: true,
        },
      },
    },
  };

  // Stubbed provider: always returns the same transcript.
  const transcribeAudio = async () => ({ text: "fallback transcript" });
  const result = await applyMediaUnderstanding({
    ctx,
    cfg,
    activeModel: { provider: "groq", model: "whisper-large-v3" },
    providers: { groq: { id: "groq", transcribeAudio } },
  });

  expect(result.appliedAudio).toBe(true);
  expect(ctx.Transcript).toBe("fallback transcript");
});
|
||||||
|
|
||||||
it("handles multiple audio attachments when attachment mode is all", async () => {
|
it("handles multiple audio attachments when attachment mode is all", async () => {
|
||||||
const { applyMediaUnderstanding } = await loadApply();
|
const { applyMediaUnderstanding } = await loadApply();
|
||||||
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
|
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ import {
|
|||||||
import { describeImageWithModel } from "./providers/image.js";
|
import { describeImageWithModel } from "./providers/image.js";
|
||||||
import {
|
import {
|
||||||
resolveCapabilityConfig,
|
resolveCapabilityConfig,
|
||||||
resolveCapabilityEnabled,
|
inferProviderCapabilities,
|
||||||
resolveConcurrency,
|
resolveConcurrency,
|
||||||
resolveMaxBytes,
|
resolveMaxBytes,
|
||||||
resolveMaxChars,
|
resolveMaxChars,
|
||||||
@@ -40,6 +40,8 @@ import {
|
|||||||
} from "./resolve.js";
|
} from "./resolve.js";
|
||||||
import type {
|
import type {
|
||||||
MediaUnderstandingCapability,
|
MediaUnderstandingCapability,
|
||||||
|
MediaUnderstandingDecision,
|
||||||
|
MediaUnderstandingModelDecision,
|
||||||
MediaUnderstandingOutput,
|
MediaUnderstandingOutput,
|
||||||
MediaUnderstandingProvider,
|
MediaUnderstandingProvider,
|
||||||
} from "./types.js";
|
} from "./types.js";
|
||||||
@@ -48,6 +50,7 @@ import { estimateBase64Size, resolveVideoMaxBase64Bytes } from "./video.js";
|
|||||||
|
|
||||||
export type ApplyMediaUnderstandingResult = {
|
export type ApplyMediaUnderstandingResult = {
|
||||||
outputs: MediaUnderstandingOutput[];
|
outputs: MediaUnderstandingOutput[];
|
||||||
|
decisions: MediaUnderstandingDecision[];
|
||||||
appliedImage: boolean;
|
appliedImage: boolean;
|
||||||
appliedAudio: boolean;
|
appliedAudio: boolean;
|
||||||
appliedVideo: boolean;
|
appliedVideo: boolean;
|
||||||
@@ -55,12 +58,70 @@ export type ApplyMediaUnderstandingResult = {
|
|||||||
|
|
||||||
const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"];
|
const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"];
|
||||||
|
|
||||||
|
/**
 * Identifies the model that callers pass in as `activeModel`, used as a
 * fallback entry when a capability is enabled without configured models.
 */
type ActiveMediaModel = {
  /** Provider id of the active model. */
  provider: string;
  /** Model name for that provider; may be omitted. */
  model?: string;
};
|
||||||
|
|
||||||
/**
 * Normalizes model output: strips surrounding whitespace and, when a limit is
 * given, truncates the text to at most `maxChars` UTF-16 code units.
 *
 * The cut never lands between the two halves of a surrogate pair. If the last
 * kept code unit would be a high surrogate whose low surrogate is cut off,
 * that unit is dropped as well, so the result never ends in half a character
 * (for example half an emoji).
 *
 * @param text - Raw text returned by the model.
 * @param maxChars - Optional length cap; `undefined` or `0` disables truncation.
 * @returns The trimmed, possibly truncated text (re-trimmed after the cut).
 */
function trimOutput(text: string, maxChars?: number): string {
  const trimmed = text.trim();
  if (!maxChars || trimmed.length <= maxChars) return trimmed;

  let end = maxChars;
  const lastKept = trimmed.charCodeAt(end - 1);
  const firstDropped = trimmed.charCodeAt(end);
  const splitsSurrogatePair =
    lastKept >= 0xd800 &&
    lastKept <= 0xdbff &&
    firstDropped >= 0xdc00 &&
    firstDropped <= 0xdfff;
  // Back off by one unit rather than emit a lone high surrogate.
  if (splitsSurrogatePair) end -= 1;

  // Truncation can expose trailing whitespace, so trim once more.
  return trimmed.slice(0, end).trim();
}
|
||||||
|
|
||||||
|
/**
 * Resolves the model entries to try for a capability, falling back to the
 * active model when nothing is configured.
 *
 * The fallback is used only when all of the following hold:
 * - `resolveModelEntries` returned no entries,
 * - the capability config has `enabled` set to exactly `true`,
 * - an active model with a non-blank provider was supplied, and
 * - `inferProviderCapabilities` lists the requested capability for it.
 *
 * In every other case the (possibly empty) resolved list is returned as is.
 *
 * @returns The configured entries, or a single provider entry for the active model.
 */
function resolveEntriesWithActiveFallback(params: {
  cfg: ClawdbotConfig;
  capability: MediaUnderstandingCapability;
  config?: MediaUnderstandingConfig;
  activeModel?: ActiveMediaModel;
}): MediaUnderstandingModelConfig[] {
  const { cfg, capability, config, activeModel } = params;
  const configured = resolveModelEntries({ cfg, capability, config });

  const fallbackAllowed = configured.length === 0 && config?.enabled === true;
  if (!fallbackAllowed) return configured;

  const provider = activeModel?.provider?.trim();
  if (!provider) return configured;

  const supported = inferProviderCapabilities(provider) ?? [];
  if (!supported.includes(capability)) return configured;

  return [{ type: "provider", provider, model: activeModel?.model }];
}
|
||||||
|
|
||||||
|
/**
 * Builds the decision record describing one attempt to run a model entry.
 *
 * For CLI entries the trimmed command is used as the provider label (or the
 * literal `"cli"` when there is no usable command) and as the model label
 * when the entry has no explicit `model`. For provider entries the provider
 * id is passed through `normalizeMediaProviderId`, falling back to the raw
 * trimmed value.
 *
 * @param params.entry - The configured model entry that was attempted.
 * @param params.entryType - Whether the entry was treated as a provider or CLI entry.
 * @param params.outcome - Result of the attempt.
 * @param params.reason - Optional explanation for the outcome.
 * @returns The decision record for this attempt.
 */
function buildModelDecision(params: {
  entry: MediaUnderstandingModelConfig;
  entryType: "provider" | "cli";
  outcome: MediaUnderstandingModelDecision["outcome"];
  reason?: string;
}): MediaUnderstandingModelDecision {
  const { entry, entryType, outcome, reason } = params;

  if (entryType === "cli") {
    // A whitespace-only command trims to "", which `??` would not replace.
    // Collapse it to undefined so the fallbacks below apply.
    const command = entry.command?.trim() || undefined;
    return {
      type: "cli",
      provider: command ?? "cli",
      model: entry.model ?? command,
      outcome,
      reason,
    };
  }

  const providerIdRaw = entry.provider?.trim();
  const providerId = providerIdRaw ? normalizeMediaProviderId(providerIdRaw) : undefined;
  return {
    type: "provider",
    provider: providerId ?? providerIdRaw,
    model: entry.model,
    outcome,
    reason,
  };
}
|
||||||
|
|
||||||
async function runProviderEntry(params: {
|
async function runProviderEntry(params: {
|
||||||
capability: MediaUnderstandingCapability;
|
capability: MediaUnderstandingCapability;
|
||||||
entry: MediaUnderstandingModelConfig;
|
entry: MediaUnderstandingModelConfig;
|
||||||
@@ -301,8 +362,9 @@ async function runAttachmentEntries(params: {
|
|||||||
cache: MediaAttachmentCache;
|
cache: MediaAttachmentCache;
|
||||||
entries: MediaUnderstandingModelConfig[];
|
entries: MediaUnderstandingModelConfig[];
|
||||||
config?: MediaUnderstandingConfig;
|
config?: MediaUnderstandingConfig;
|
||||||
}): Promise<MediaUnderstandingOutput | null> {
|
}): Promise<{ output: MediaUnderstandingOutput | null; attempts: MediaUnderstandingModelDecision[] }> {
|
||||||
const { entries, capability } = params;
|
const { entries, capability } = params;
|
||||||
|
const attempts: MediaUnderstandingModelDecision[] = [];
|
||||||
for (const entry of entries) {
|
for (const entry of entries) {
|
||||||
try {
|
try {
|
||||||
const entryType = entry.type ?? (entry.command ? "cli" : "provider");
|
const entryType = entry.type ?? (entry.command ? "cli" : "provider");
|
||||||
@@ -328,21 +390,46 @@ async function runAttachmentEntries(params: {
|
|||||||
providerRegistry: params.providerRegistry,
|
providerRegistry: params.providerRegistry,
|
||||||
config: params.config,
|
config: params.config,
|
||||||
});
|
});
|
||||||
if (result) return result;
|
if (result) {
|
||||||
|
const decision = buildModelDecision({ entry, entryType, outcome: "success" });
|
||||||
|
if (result.provider) decision.provider = result.provider;
|
||||||
|
if (result.model) decision.model = result.model;
|
||||||
|
attempts.push(decision);
|
||||||
|
return { output: result, attempts };
|
||||||
|
}
|
||||||
|
attempts.push(
|
||||||
|
buildModelDecision({ entry, entryType, outcome: "skipped", reason: "empty output" }),
|
||||||
|
);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
if (isMediaUnderstandingSkipError(err)) {
|
if (isMediaUnderstandingSkipError(err)) {
|
||||||
|
attempts.push(
|
||||||
|
buildModelDecision({
|
||||||
|
entry,
|
||||||
|
entryType: entry.type ?? (entry.command ? "cli" : "provider"),
|
||||||
|
outcome: "skipped",
|
||||||
|
reason: `${err.reason}: ${err.message}`,
|
||||||
|
}),
|
||||||
|
);
|
||||||
if (shouldLogVerbose()) {
|
if (shouldLogVerbose()) {
|
||||||
logVerbose(`Skipping ${capability} model due to ${err.reason}: ${err.message}`);
|
logVerbose(`Skipping ${capability} model due to ${err.reason}: ${err.message}`);
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
attempts.push(
|
||||||
|
buildModelDecision({
|
||||||
|
entry,
|
||||||
|
entryType: entry.type ?? (entry.command ? "cli" : "provider"),
|
||||||
|
outcome: "failed",
|
||||||
|
reason: String(err),
|
||||||
|
}),
|
||||||
|
);
|
||||||
if (shouldLogVerbose()) {
|
if (shouldLogVerbose()) {
|
||||||
logVerbose(`${capability} understanding failed: ${String(err)}`);
|
logVerbose(`${capability} understanding failed: ${String(err)}`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return null;
|
return { output: null, attempts };
|
||||||
}
|
}
|
||||||
|
|
||||||
async function runCapability(params: {
|
async function runCapability(params: {
|
||||||
@@ -350,33 +437,74 @@ async function runCapability(params: {
|
|||||||
cfg: ClawdbotConfig;
|
cfg: ClawdbotConfig;
|
||||||
ctx: MsgContext;
|
ctx: MsgContext;
|
||||||
attachments: MediaAttachmentCache;
|
attachments: MediaAttachmentCache;
|
||||||
attachmentIds: number[];
|
media: ReturnType<typeof normalizeAttachments>;
|
||||||
agentDir?: string;
|
agentDir?: string;
|
||||||
providerRegistry: Map<string, MediaUnderstandingProvider>;
|
providerRegistry: Map<string, MediaUnderstandingProvider>;
|
||||||
config?: MediaUnderstandingConfig;
|
config?: MediaUnderstandingConfig;
|
||||||
}): Promise<MediaUnderstandingOutput[]> {
|
activeModel?: ActiveMediaModel;
|
||||||
|
}): Promise<{ outputs: MediaUnderstandingOutput[]; decision: MediaUnderstandingDecision }> {
|
||||||
const { capability, cfg, ctx } = params;
|
const { capability, cfg, ctx } = params;
|
||||||
const config = params.config ?? resolveCapabilityConfig(cfg, capability);
|
const config = params.config ?? resolveCapabilityConfig(cfg, capability);
|
||||||
if (!resolveCapabilityEnabled({ cfg, config })) return [];
|
if (config?.enabled === false) {
|
||||||
|
return {
|
||||||
|
outputs: [],
|
||||||
|
decision: { capability, outcome: "disabled", attachments: [] },
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
const entries = resolveModelEntries({ cfg, capability, config });
|
const attachmentPolicy = config?.attachments;
|
||||||
if (entries.length === 0) return [];
|
const selected = selectAttachments({
|
||||||
|
capability,
|
||||||
|
attachments: params.media,
|
||||||
|
policy: attachmentPolicy,
|
||||||
|
});
|
||||||
|
if (selected.length === 0) {
|
||||||
|
return {
|
||||||
|
outputs: [],
|
||||||
|
decision: { capability, outcome: "no-attachment", attachments: [] },
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
const scopeDecision = resolveScopeDecision({ scope: config?.scope, ctx });
|
const scopeDecision = resolveScopeDecision({ scope: config?.scope, ctx });
|
||||||
if (scopeDecision === "deny") {
|
if (scopeDecision === "deny") {
|
||||||
if (shouldLogVerbose()) {
|
if (shouldLogVerbose()) {
|
||||||
logVerbose(`${capability} understanding disabled by scope policy.`);
|
logVerbose(`${capability} understanding disabled by scope policy.`);
|
||||||
}
|
}
|
||||||
return [];
|
return {
|
||||||
|
outputs: [],
|
||||||
|
decision: {
|
||||||
|
capability,
|
||||||
|
outcome: "scope-deny",
|
||||||
|
attachments: selected.map((item) => ({ attachmentIndex: item.index, attempts: [] })),
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
const entries = resolveEntriesWithActiveFallback({
|
||||||
|
cfg,
|
||||||
|
capability,
|
||||||
|
config,
|
||||||
|
activeModel: params.activeModel,
|
||||||
|
});
|
||||||
|
if (entries.length === 0) {
|
||||||
|
return {
|
||||||
|
outputs: [],
|
||||||
|
decision: {
|
||||||
|
capability,
|
||||||
|
outcome: "skipped",
|
||||||
|
attachments: selected.map((item) => ({ attachmentIndex: item.index, attempts: [] })),
|
||||||
|
},
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
const outputs: MediaUnderstandingOutput[] = [];
|
const outputs: MediaUnderstandingOutput[] = [];
|
||||||
for (const attachmentIndex of params.attachmentIds) {
|
const attachmentDecisions: MediaUnderstandingDecision["attachments"] = [];
|
||||||
const output = await runAttachmentEntries({
|
for (const attachment of selected) {
|
||||||
|
const { output, attempts } = await runAttachmentEntries({
|
||||||
capability,
|
capability,
|
||||||
cfg,
|
cfg,
|
||||||
ctx,
|
ctx,
|
||||||
attachmentIndex,
|
attachmentIndex: attachment.index,
|
||||||
agentDir: params.agentDir,
|
agentDir: params.agentDir,
|
||||||
providerRegistry: params.providerRegistry,
|
providerRegistry: params.providerRegistry,
|
||||||
cache: params.attachments,
|
cache: params.attachments,
|
||||||
@@ -384,8 +512,20 @@ async function runCapability(params: {
|
|||||||
config,
|
config,
|
||||||
});
|
});
|
||||||
if (output) outputs.push(output);
|
if (output) outputs.push(output);
|
||||||
|
attachmentDecisions.push({
|
||||||
|
attachmentIndex: attachment.index,
|
||||||
|
attempts,
|
||||||
|
chosen: attempts.find((attempt) => attempt.outcome === "success"),
|
||||||
|
});
|
||||||
}
|
}
|
||||||
return outputs;
|
return {
|
||||||
|
outputs,
|
||||||
|
decision: {
|
||||||
|
capability,
|
||||||
|
outcome: outputs.length > 0 ? "success" : "skipped",
|
||||||
|
attachments: attachmentDecisions,
|
||||||
|
},
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function applyMediaUnderstanding(params: {
|
export async function applyMediaUnderstanding(params: {
|
||||||
@@ -393,6 +533,7 @@ export async function applyMediaUnderstanding(params: {
|
|||||||
cfg: ClawdbotConfig;
|
cfg: ClawdbotConfig;
|
||||||
agentDir?: string;
|
agentDir?: string;
|
||||||
providers?: Record<string, MediaUnderstandingProvider>;
|
providers?: Record<string, MediaUnderstandingProvider>;
|
||||||
|
activeModel?: ActiveMediaModel;
|
||||||
}): Promise<ApplyMediaUnderstandingResult> {
|
}): Promise<ApplyMediaUnderstandingResult> {
|
||||||
const { ctx, cfg } = params;
|
const { ctx, cfg } = params;
|
||||||
const commandCandidates = [ctx.CommandBody, ctx.RawBody, ctx.Body];
|
const commandCandidates = [ctx.CommandBody, ctx.RawBody, ctx.Body];
|
||||||
@@ -408,33 +549,40 @@ export async function applyMediaUnderstanding(params: {
|
|||||||
try {
|
try {
|
||||||
const tasks = CAPABILITY_ORDER.map((capability) => async () => {
|
const tasks = CAPABILITY_ORDER.map((capability) => async () => {
|
||||||
const config = resolveCapabilityConfig(cfg, capability);
|
const config = resolveCapabilityConfig(cfg, capability);
|
||||||
const attachmentPolicy = config?.attachments;
|
|
||||||
const selected = selectAttachments({
|
|
||||||
capability,
|
|
||||||
attachments,
|
|
||||||
policy: attachmentPolicy,
|
|
||||||
});
|
|
||||||
if (selected.length === 0) return [] as MediaUnderstandingOutput[];
|
|
||||||
return await runCapability({
|
return await runCapability({
|
||||||
capability,
|
capability,
|
||||||
cfg,
|
cfg,
|
||||||
ctx,
|
ctx,
|
||||||
attachments: cache,
|
attachments: cache,
|
||||||
attachmentIds: selected.map((item) => item.index),
|
media: attachments,
|
||||||
agentDir: params.agentDir,
|
agentDir: params.agentDir,
|
||||||
providerRegistry,
|
providerRegistry,
|
||||||
config,
|
config,
|
||||||
|
activeModel: params.activeModel,
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
const results = await runWithConcurrency(tasks, resolveConcurrency(cfg));
|
const results = await runWithConcurrency(tasks, resolveConcurrency(cfg));
|
||||||
const outputs: MediaUnderstandingOutput[] = [];
|
const outputs: MediaUnderstandingOutput[] = [];
|
||||||
|
const decisions: MediaUnderstandingDecision[] = [];
|
||||||
for (const [index] of CAPABILITY_ORDER.entries()) {
|
for (const [index] of CAPABILITY_ORDER.entries()) {
|
||||||
const entries = results[index] ?? [];
|
const entry = results[index];
|
||||||
if (!Array.isArray(entries)) continue;
|
if (!entry) continue;
|
||||||
for (const entry of entries) {
|
if (Array.isArray(entry.outputs)) {
|
||||||
outputs.push(entry);
|
for (const output of entry.outputs) {
|
||||||
|
outputs.push(output);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
if (entry.decision) {
|
||||||
|
decisions.push(entry.decision);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (decisions.length > 0) {
|
||||||
|
ctx.MediaUnderstandingDecisions = [
|
||||||
|
...(ctx.MediaUnderstandingDecisions ?? []),
|
||||||
|
...decisions,
|
||||||
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
if (outputs.length > 0) {
|
if (outputs.length > 0) {
|
||||||
@@ -443,8 +591,13 @@ export async function applyMediaUnderstanding(params: {
|
|||||||
if (audioOutputs.length > 0) {
|
if (audioOutputs.length > 0) {
|
||||||
const transcript = formatAudioTranscripts(audioOutputs);
|
const transcript = formatAudioTranscripts(audioOutputs);
|
||||||
ctx.Transcript = transcript;
|
ctx.Transcript = transcript;
|
||||||
ctx.CommandBody = transcript;
|
if (originalUserText) {
|
||||||
ctx.RawBody = transcript;
|
ctx.CommandBody = originalUserText;
|
||||||
|
ctx.RawBody = originalUserText;
|
||||||
|
} else {
|
||||||
|
ctx.CommandBody = transcript;
|
||||||
|
ctx.RawBody = transcript;
|
||||||
|
}
|
||||||
} else if (originalUserText) {
|
} else if (originalUserText) {
|
||||||
ctx.CommandBody = originalUserText;
|
ctx.CommandBody = originalUserText;
|
||||||
ctx.RawBody = originalUserText;
|
ctx.RawBody = originalUserText;
|
||||||
@@ -455,6 +608,7 @@ export async function applyMediaUnderstanding(params: {
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
outputs,
|
outputs,
|
||||||
|
decisions,
|
||||||
appliedImage: outputs.some((output) => output.kind === "image.description"),
|
appliedImage: outputs.some((output) => output.kind === "image.description"),
|
||||||
appliedAudio: outputs.some((output) => output.kind === "audio.transcription"),
|
appliedAudio: outputs.some((output) => output.kind === "audio.transcription"),
|
||||||
appliedVideo: outputs.some((output) => output.kind === "video.description"),
|
appliedVideo: outputs.some((output) => output.kind === "video.description"),
|
||||||
|
|||||||
@@ -77,13 +77,10 @@ export function resolveScopeDecision(params: {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
function inferCapabilities(
|
export function inferProviderCapabilities(
|
||||||
entry: MediaUnderstandingModelConfig,
|
providerId?: string,
|
||||||
): MediaUnderstandingCapability[] | undefined {
|
): MediaUnderstandingCapability[] | undefined {
|
||||||
if ((entry.type ?? (entry.command ? "cli" : "provider")) === "cli") {
|
const provider = normalizeMediaProviderId(providerId ?? "");
|
||||||
return ["image", "audio", "video"];
|
|
||||||
}
|
|
||||||
const provider = normalizeMediaProviderId(entry.provider ?? "");
|
|
||||||
if (!provider) return undefined;
|
if (!provider) return undefined;
|
||||||
if (provider === "openai" || provider === "anthropic" || provider === "minimax") {
|
if (provider === "openai" || provider === "anthropic" || provider === "minimax") {
|
||||||
return ["image"];
|
return ["image"];
|
||||||
@@ -97,6 +94,15 @@ function inferCapabilities(
|
|||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Infers the media capabilities of a configured model entry.
 *
 * An entry counts as CLI when its `type` is `"cli"`, or when `type` is unset
 * and a `command` is present. CLI entries yield `undefined`; all other
 * entries are resolved by provider id via `inferProviderCapabilities`.
 *
 * @param entry - The configured model entry to inspect.
 * @returns The inferred capabilities, or `undefined` when none can be inferred.
 */
function inferCapabilities(
  entry: MediaUnderstandingModelConfig,
): MediaUnderstandingCapability[] | undefined {
  const entryType = entry.type ?? (entry.command ? "cli" : "provider");
  return entryType === "cli" ? undefined : inferProviderCapabilities(entry.provider);
}
|
||||||
|
|
||||||
export function resolveModelEntries(params: {
|
export function resolveModelEntries(params: {
|
||||||
cfg: ClawdbotConfig;
|
cfg: ClawdbotConfig;
|
||||||
capability: MediaUnderstandingCapability;
|
capability: MediaUnderstandingCapability;
|
||||||
|
|||||||
Reference in New Issue
Block a user