refactor: unify media understanding pipeline
This commit is contained in:
@@ -1770,13 +1770,16 @@ Legacy: `tools.bash` is still accepted as an alias.
|
|||||||
- `tools.web.fetch.firecrawl.timeoutSeconds` (optional)
|
- `tools.web.fetch.firecrawl.timeoutSeconds` (optional)
|
||||||
|
|
||||||
`tools.media` configures inbound media understanding (image/audio/video):
|
`tools.media` configures inbound media understanding (image/audio/video):
|
||||||
|
- `tools.media.models`: shared model list (capability-tagged; used after per-cap lists).
|
||||||
|
- `tools.media.concurrency`: max concurrent capability runs (default 2).
|
||||||
- `tools.media.image` / `tools.media.audio` / `tools.media.video`:
|
- `tools.media.image` / `tools.media.audio` / `tools.media.video`:
|
||||||
- `enabled`: opt-out switch (default true).
|
- `enabled`: opt-out switch (default true when models are configured).
|
||||||
- `prompt`: optional prompt override (image/video append a `maxChars` hint automatically).
|
- `prompt`: optional prompt override (image/video append a `maxChars` hint automatically).
|
||||||
- `maxChars`: max output characters (default 500 for image/video; unset for audio).
|
- `maxChars`: max output characters (default 500 for image/video; unset for audio).
|
||||||
- `maxBytes`: max media size to send (defaults: image 10MB, audio 20MB, video 50MB).
|
- `maxBytes`: max media size to send (defaults: image 10MB, audio 20MB, video 50MB).
|
||||||
- `timeoutSeconds`: request timeout (defaults: image 60s, audio 60s, video 120s).
|
- `timeoutSeconds`: request timeout (defaults: image 60s, audio 60s, video 120s).
|
||||||
- `language`: optional audio hint.
|
- `language`: optional audio hint.
|
||||||
|
- `attachments`: attachment policy (`mode`, `maxAttachments`, `prefer`).
|
||||||
- `scope`: optional gating (first match wins) with `match.channel`, `match.chatType`, or `match.keyPrefix`.
|
- `scope`: optional gating (first match wins) with `match.channel`, `match.chatType`, or `match.keyPrefix`.
|
||||||
- `models`: ordered list of model entries; failures or oversize media fall back to the next entry.
|
- `models`: ordered list of model entries; failures or oversize media fall back to the next entry.
|
||||||
- Each `models[]` entry:
|
- Each `models[]` entry:
|
||||||
@@ -1787,7 +1790,7 @@ Legacy: `tools.bash` is still accepted as an alias.
|
|||||||
- CLI entry (`type: "cli"`):
|
- CLI entry (`type: "cli"`):
|
||||||
- `command`: executable to run.
|
- `command`: executable to run.
|
||||||
- `args`: templated args (supports `{{MediaPath}}`, `{{Prompt}}`, `{{MaxChars}}`, etc).
|
- `args`: templated args (supports `{{MediaPath}}`, `{{Prompt}}`, `{{MaxChars}}`, etc).
|
||||||
- `capabilities`: optional list (`image`, `audio`, `video`) to gate a shared entry.
|
- `capabilities`: optional list (`image`, `audio`, `video`) to gate a shared entry. Defaults when omitted: `openai`/`anthropic`/`minimax` → image, `google` → image+audio+video, `groq` → audio.
|
||||||
- `prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language` can be overridden per entry.
|
- `prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language` can be overridden per entry.
|
||||||
|
|
||||||
If no models are configured (or `enabled: false`), understanding is skipped; the model still receives the original attachments.
|
If no models are configured (or `enabled: false`), understanding is skipped; the model still receives the original attachments.
|
||||||
@@ -2900,7 +2903,7 @@ clawdbot dns setup --apply
|
|||||||
|
|
||||||
## Template variables
|
## Template variables
|
||||||
|
|
||||||
Template placeholders are expanded in `tools.media.*.models[].args` (and any future templated argument fields).
|
Template placeholders are expanded in `tools.media.*.models[].args` and `tools.media.models[].args` (and any future templated argument fields).
|
||||||
|
|
||||||
| Variable | Description |
|
| Variable | Description |
|
||||||
|----------|-------------|
|
|----------|-------------|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ read_when:
|
|||||||
# Audio / Voice Notes — 2026-01-17
|
# Audio / Voice Notes — 2026-01-17
|
||||||
|
|
||||||
## What works
|
## What works
|
||||||
- **Media understanding (audio)**: If `tools.media.audio` is enabled and has `models`, Clawdbot:
|
- **Media understanding (audio)**: If `tools.media.audio` is enabled (or a shared `tools.media.models` entry supports audio), Clawdbot:
|
||||||
1) Locates the first audio attachment (local path or URL) and downloads it if needed.
|
1) Locates the first audio attachment (local path or URL) and downloads it if needed.
|
||||||
2) Enforces `maxBytes` before sending to each model entry.
|
2) Enforces `maxBytes` before sending to each model entry.
|
||||||
3) Runs the first eligible model entry in order (provider or CLI).
|
3) Runs the first eligible model entry in order (provider or CLI).
|
||||||
@@ -66,6 +66,7 @@ read_when:
|
|||||||
- Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`).
|
- Provider auth follows the standard model auth order (auth profiles, env vars, `models.providers.*.apiKey`).
|
||||||
- Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried.
|
- Default size cap is 20MB (`tools.media.audio.maxBytes`). Oversize audio is skipped for that model and the next entry is tried.
|
||||||
- Default `maxChars` for audio is **unset** (full transcript). Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output.
|
- Default `maxChars` for audio is **unset** (full transcript). Set `tools.media.audio.maxChars` or per-entry `maxChars` to trim output.
|
||||||
|
- Use `tools.media.audio.attachments` to process multiple voice notes (`mode: "all"` + `maxAttachments`).
|
||||||
- Transcript is available to templates as `{{Transcript}}`.
|
- Transcript is available to templates as `{{Transcript}}`.
|
||||||
- CLI stdout is capped (5MB); keep CLI output concise.
|
- CLI stdout is capped (5MB); keep CLI output concise.
|
||||||
|
|
||||||
|
|||||||
@@ -38,10 +38,10 @@ The WhatsApp channel runs via **Baileys Web**. This document captures the curren
|
|||||||
- `{{MediaUrl}}` pseudo-URL for the inbound media.
|
- `{{MediaUrl}}` pseudo-URL for the inbound media.
|
||||||
- `{{MediaPath}}` local temp path written before running the command.
|
- `{{MediaPath}}` local temp path written before running the command.
|
||||||
- When a per-session Docker sandbox is enabled, inbound media is copied into the sandbox workspace and `MediaPath`/`MediaUrl` are rewritten to a relative path like `media/inbound/<filename>`.
|
- When a per-session Docker sandbox is enabled, inbound media is copied into the sandbox workspace and `MediaPath`/`MediaUrl` are rewritten to a relative path like `media/inbound/<filename>`.
|
||||||
- Media understanding (if configured via `tools.media.*`) runs before templating and can insert `[Image]`, `[Audio]`, and `[Video]` blocks into `Body`.
|
- Media understanding (if configured via `tools.media.*` or shared `tools.media.models`) runs before templating and can insert `[Image]`, `[Audio]`, and `[Video]` blocks into `Body`.
|
||||||
- Audio sets `{{Transcript}}` and uses the transcript for command parsing so slash commands still work.
|
- Audio sets `{{Transcript}}` and uses the transcript for command parsing so slash commands still work.
|
||||||
- Video and image descriptions preserve any caption text for command parsing.
|
- Video and image descriptions preserve any caption text for command parsing.
|
||||||
- Only the first matching image/audio/video attachment is processed; remaining attachments are left untouched.
|
- By default only the first matching image/audio/video attachment is processed; set `tools.media.<cap>.attachments` to process multiple attachments.
|
||||||
|
|
||||||
## Limits & Errors
|
## Limits & Errors
|
||||||
**Outbound send caps (WhatsApp web send)**
|
**Outbound send caps (WhatsApp web send)**
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ Clawdbot can optionally **summarize inbound media** (image/audio/video) before t
|
|||||||
|
|
||||||
## High‑level behavior
|
## High‑level behavior
|
||||||
1) Collect inbound attachments (`MediaPaths`, `MediaUrls`, `MediaTypes`).
|
1) Collect inbound attachments (`MediaPaths`, `MediaUrls`, `MediaTypes`).
|
||||||
2) For each enabled capability (image/audio/video), pick the **first matching attachment**.
|
2) For each enabled capability (image/audio/video), select attachments per policy (default: **first**).
|
||||||
3) Choose the first eligible model entry (size + capability + auth).
|
3) Choose the first eligible model entry (size + capability + auth).
|
||||||
4) If a model fails or the media is too large, **fall back to the next entry**.
|
4) If a model fails or the media is too large, **fall back to the next entry**.
|
||||||
5) On success:
|
5) On success:
|
||||||
@@ -27,18 +27,23 @@ Clawdbot can optionally **summarize inbound media** (image/audio/video) before t
|
|||||||
If understanding fails or is disabled, **the reply flow continues** with the original body + attachments.
|
If understanding fails or is disabled, **the reply flow continues** with the original body + attachments.
|
||||||
|
|
||||||
## Config overview
|
## Config overview
|
||||||
Use **per‑capability configs** under `tools.media`. Each capability can define:
|
`tools.media` supports **shared models** plus per‑capability overrides:
|
||||||
- defaults (`prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`)
|
- `tools.media.models`: shared model list (use `capabilities` to gate).
|
||||||
- **ordered `models` list** (fallback order)
|
- `tools.media.image` / `tools.media.audio` / `tools.media.video`:
|
||||||
- `scope` (optional gating by channel/chatType/session key)
|
- defaults (`prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`)
|
||||||
|
- optional **per‑capability `models` list** (preferred before shared models)
|
||||||
|
- `attachments` policy (`mode`, `maxAttachments`, `prefer`)
|
||||||
|
- `scope` (optional gating by channel/chatType/session key)
|
||||||
|
- `tools.media.concurrency`: max concurrent capability runs (default **2**).
|
||||||
|
|
||||||
```json5
|
```json5
|
||||||
{
|
{
|
||||||
tools: {
|
tools: {
|
||||||
media: {
|
media: {
|
||||||
image: { /* config */ },
|
models: [ /* shared list */ ],
|
||||||
audio: { /* config */ },
|
image: { /* optional overrides */ },
|
||||||
video: { /* config */ }
|
audio: { /* optional overrides */ },
|
||||||
|
video: { /* optional overrides */ }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -95,12 +100,13 @@ Rules:
|
|||||||
- `prompt` defaults to simple “Describe the {media}.” plus the `maxChars` guidance (image/video only).
|
- `prompt` defaults to simple “Describe the {media}.” plus the `maxChars` guidance (image/video only).
|
||||||
|
|
||||||
## Capabilities (optional)
|
## Capabilities (optional)
|
||||||
If you set `capabilities`, the entry only runs for those media types. Suggested
|
If you set `capabilities`, the entry only runs for those media types. For shared
|
||||||
defaults when you opt in:
|
lists, Clawdbot can infer defaults:
|
||||||
- `openai`, `anthropic`: **image**
|
- `openai`, `anthropic`, `minimax`: **image**
|
||||||
- `google` (Gemini API): **image + audio + video**
|
- `google` (Gemini API): **image + audio + video**
|
||||||
- CLI entries: declare the exact capabilities you support.
|
- `groq`: **audio**
|
||||||
|
|
||||||
|
For CLI entries, **set `capabilities` explicitly** to avoid surprising matches.
|
||||||
If you omit `capabilities`, the entry is eligible for the list it appears in.
|
If you omit `capabilities`, the entry is eligible for the list it appears in.
|
||||||
|
|
||||||
## Provider support matrix (Clawdbot integrations)
|
## Provider support matrix (Clawdbot integrations)
|
||||||
@@ -123,9 +129,49 @@ If you omit `capabilities`, the entry is eligible for the list it appears in.
|
|||||||
- `google/gemini-3-flash-preview` (fast), `google/gemini-3-pro-preview` (richer).
|
- `google/gemini-3-flash-preview` (fast), `google/gemini-3-pro-preview` (richer).
|
||||||
- CLI fallback: `gemini` CLI (supports `read_file` on video/audio).
|
- CLI fallback: `gemini` CLI (supports `read_file` on video/audio).
|
||||||
|
|
||||||
|
## Attachment policy
|
||||||
|
Per‑capability `attachments` controls which attachments are processed:
|
||||||
|
- `mode`: `first` (default) or `all`
|
||||||
|
- `maxAttachments`: cap the number processed (default **1**)
|
||||||
|
- `prefer`: `first`, `last`, `path`, `url`
|
||||||
|
|
||||||
|
When `mode: "all"`, outputs are labeled `[Image 1/2]`, `[Audio 2/2]`, etc.
|
||||||
|
|
||||||
## Config examples
|
## Config examples
|
||||||
|
|
||||||
### 1) Audio + Video only (image off)
|
### 1) Shared models list + overrides
|
||||||
|
```json5
|
||||||
|
{
|
||||||
|
tools: {
|
||||||
|
media: {
|
||||||
|
models: [
|
||||||
|
{ provider: "openai", model: "gpt-5.2", capabilities: ["image"] },
|
||||||
|
{ provider: "google", model: "gemini-3-flash-preview", capabilities: ["image", "audio", "video"] },
|
||||||
|
{
|
||||||
|
type: "cli",
|
||||||
|
command: "gemini",
|
||||||
|
args: [
|
||||||
|
"-m",
|
||||||
|
"gemini-3-flash",
|
||||||
|
"--allowed-tools",
|
||||||
|
"read_file",
|
||||||
|
"Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters."
|
||||||
|
],
|
||||||
|
capabilities: ["image", "video"]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
audio: {
|
||||||
|
attachments: { mode: "all", maxAttachments: 2 }
|
||||||
|
},
|
||||||
|
video: {
|
||||||
|
maxChars: 500
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2) Audio + Video only (image off)
|
||||||
```json5
|
```json5
|
||||||
{
|
{
|
||||||
tools: {
|
tools: {
|
||||||
@@ -164,7 +210,7 @@ If you omit `capabilities`, the entry is eligible for the list it appears in.
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
### 2) Optional image understanding
|
### 3) Optional image understanding
|
||||||
```json5
|
```json5
|
||||||
{
|
{
|
||||||
tools: {
|
tools: {
|
||||||
@@ -194,7 +240,7 @@ If you omit `capabilities`, the entry is eligible for the list it appears in.
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
### 3) Multi‑modal single entry (explicit capabilities)
|
### 4) Multi‑modal single entry (explicit capabilities)
|
||||||
```json5
|
```json5
|
||||||
{
|
{
|
||||||
tools: {
|
tools: {
|
||||||
|
|||||||
@@ -107,14 +107,18 @@ const FIELD_LABELS: Record<string, string> = {
|
|||||||
"tools.media.image.maxChars": "Image Understanding Max Chars",
|
"tools.media.image.maxChars": "Image Understanding Max Chars",
|
||||||
"tools.media.image.prompt": "Image Understanding Prompt",
|
"tools.media.image.prompt": "Image Understanding Prompt",
|
||||||
"tools.media.image.timeoutSeconds": "Image Understanding Timeout (sec)",
|
"tools.media.image.timeoutSeconds": "Image Understanding Timeout (sec)",
|
||||||
|
"tools.media.image.attachments": "Image Understanding Attachment Policy",
|
||||||
"tools.media.image.models": "Image Understanding Models",
|
"tools.media.image.models": "Image Understanding Models",
|
||||||
"tools.media.image.scope": "Image Understanding Scope",
|
"tools.media.image.scope": "Image Understanding Scope",
|
||||||
|
"tools.media.models": "Media Understanding Shared Models",
|
||||||
|
"tools.media.concurrency": "Media Understanding Concurrency",
|
||||||
"tools.media.audio.enabled": "Enable Audio Understanding",
|
"tools.media.audio.enabled": "Enable Audio Understanding",
|
||||||
"tools.media.audio.maxBytes": "Audio Understanding Max Bytes",
|
"tools.media.audio.maxBytes": "Audio Understanding Max Bytes",
|
||||||
"tools.media.audio.maxChars": "Audio Understanding Max Chars",
|
"tools.media.audio.maxChars": "Audio Understanding Max Chars",
|
||||||
"tools.media.audio.prompt": "Audio Understanding Prompt",
|
"tools.media.audio.prompt": "Audio Understanding Prompt",
|
||||||
"tools.media.audio.timeoutSeconds": "Audio Understanding Timeout (sec)",
|
"tools.media.audio.timeoutSeconds": "Audio Understanding Timeout (sec)",
|
||||||
"tools.media.audio.language": "Audio Understanding Language",
|
"tools.media.audio.language": "Audio Understanding Language",
|
||||||
|
"tools.media.audio.attachments": "Audio Understanding Attachment Policy",
|
||||||
"tools.media.audio.models": "Audio Understanding Models",
|
"tools.media.audio.models": "Audio Understanding Models",
|
||||||
"tools.media.audio.scope": "Audio Understanding Scope",
|
"tools.media.audio.scope": "Audio Understanding Scope",
|
||||||
"tools.media.video.enabled": "Enable Video Understanding",
|
"tools.media.video.enabled": "Enable Video Understanding",
|
||||||
@@ -122,6 +126,7 @@ const FIELD_LABELS: Record<string, string> = {
|
|||||||
"tools.media.video.maxChars": "Video Understanding Max Chars",
|
"tools.media.video.maxChars": "Video Understanding Max Chars",
|
||||||
"tools.media.video.prompt": "Video Understanding Prompt",
|
"tools.media.video.prompt": "Video Understanding Prompt",
|
||||||
"tools.media.video.timeoutSeconds": "Video Understanding Timeout (sec)",
|
"tools.media.video.timeoutSeconds": "Video Understanding Timeout (sec)",
|
||||||
|
"tools.media.video.attachments": "Video Understanding Attachment Policy",
|
||||||
"tools.media.video.models": "Video Understanding Models",
|
"tools.media.video.models": "Video Understanding Models",
|
||||||
"tools.media.video.scope": "Video Understanding Scope",
|
"tools.media.video.scope": "Video Understanding Scope",
|
||||||
"tools.profile": "Tool Profile",
|
"tools.profile": "Tool Profile",
|
||||||
|
|||||||
@@ -18,6 +18,15 @@ export type MediaUnderstandingScopeConfig = {
|
|||||||
|
|
||||||
export type MediaUnderstandingCapability = "image" | "audio" | "video";
|
export type MediaUnderstandingCapability = "image" | "audio" | "video";
|
||||||
|
|
||||||
|
export type MediaUnderstandingAttachmentsConfig = {
|
||||||
|
/** Select the first matching attachment or process multiple. */
|
||||||
|
mode?: "first" | "all";
|
||||||
|
/** Max number of attachments to process (default: 1). */
|
||||||
|
maxAttachments?: number;
|
||||||
|
/** Attachment ordering preference. */
|
||||||
|
prefer?: "first" | "last" | "path" | "url";
|
||||||
|
};
|
||||||
|
|
||||||
export type MediaUnderstandingModelConfig = {
|
export type MediaUnderstandingModelConfig = {
|
||||||
/** provider API id (e.g. openai, google). */
|
/** provider API id (e.g. openai, google). */
|
||||||
provider?: string;
|
provider?: string;
|
||||||
@@ -62,11 +71,17 @@ export type MediaUnderstandingConfig = {
|
|||||||
timeoutSeconds?: number;
|
timeoutSeconds?: number;
|
||||||
/** Default language hint (audio). */
|
/** Default language hint (audio). */
|
||||||
language?: string;
|
language?: string;
|
||||||
|
/** Attachment selection policy. */
|
||||||
|
attachments?: MediaUnderstandingAttachmentsConfig;
|
||||||
/** Ordered model list (fallbacks in order). */
|
/** Ordered model list (fallbacks in order). */
|
||||||
models?: MediaUnderstandingModelConfig[];
|
models?: MediaUnderstandingModelConfig[];
|
||||||
};
|
};
|
||||||
|
|
||||||
export type MediaToolsConfig = {
|
export type MediaToolsConfig = {
|
||||||
|
/** Shared model list applied across image/audio/video. */
|
||||||
|
models?: MediaUnderstandingModelConfig[];
|
||||||
|
/** Max concurrent media understanding runs. */
|
||||||
|
concurrency?: number;
|
||||||
image?: MediaUnderstandingConfig;
|
image?: MediaUnderstandingConfig;
|
||||||
audio?: MediaUnderstandingConfig;
|
audio?: MediaUnderstandingConfig;
|
||||||
video?: MediaUnderstandingConfig;
|
video?: MediaUnderstandingConfig;
|
||||||
|
|||||||
@@ -271,6 +271,14 @@ export const MediaUnderstandingCapabilitiesSchema = z
|
|||||||
.array(z.union([z.literal("image"), z.literal("audio"), z.literal("video")]))
|
.array(z.union([z.literal("image"), z.literal("audio"), z.literal("video")]))
|
||||||
.optional();
|
.optional();
|
||||||
|
|
||||||
|
export const MediaUnderstandingAttachmentsSchema = z
|
||||||
|
.object({
|
||||||
|
mode: z.union([z.literal("first"), z.literal("all")]).optional(),
|
||||||
|
maxAttachments: z.number().int().positive().optional(),
|
||||||
|
prefer: z.union([z.literal("first"), z.literal("last"), z.literal("path"), z.literal("url")]).optional(),
|
||||||
|
})
|
||||||
|
.optional();
|
||||||
|
|
||||||
export const MediaUnderstandingModelSchema = z
|
export const MediaUnderstandingModelSchema = z
|
||||||
.object({
|
.object({
|
||||||
provider: z.string().optional(),
|
provider: z.string().optional(),
|
||||||
@@ -298,12 +306,15 @@ export const ToolsMediaUnderstandingSchema = z
|
|||||||
prompt: z.string().optional(),
|
prompt: z.string().optional(),
|
||||||
timeoutSeconds: z.number().int().positive().optional(),
|
timeoutSeconds: z.number().int().positive().optional(),
|
||||||
language: z.string().optional(),
|
language: z.string().optional(),
|
||||||
|
attachments: MediaUnderstandingAttachmentsSchema,
|
||||||
models: z.array(MediaUnderstandingModelSchema).optional(),
|
models: z.array(MediaUnderstandingModelSchema).optional(),
|
||||||
})
|
})
|
||||||
.optional();
|
.optional();
|
||||||
|
|
||||||
export const ToolsMediaSchema = z
|
export const ToolsMediaSchema = z
|
||||||
.object({
|
.object({
|
||||||
|
models: z.array(MediaUnderstandingModelSchema).optional(),
|
||||||
|
concurrency: z.number().int().positive().optional(),
|
||||||
image: ToolsMediaUnderstandingSchema.optional(),
|
image: ToolsMediaUnderstandingSchema.optional(),
|
||||||
audio: ToolsMediaUnderstandingSchema.optional(),
|
audio: ToolsMediaUnderstandingSchema.optional(),
|
||||||
video: ToolsMediaUnderstandingSchema.optional(),
|
video: ToolsMediaUnderstandingSchema.optional(),
|
||||||
|
|||||||
@@ -255,4 +255,90 @@ describe("applyMediaUnderstanding", () => {
|
|||||||
expect(ctx.CommandBody).toBe("show Dom");
|
expect(ctx.CommandBody).toBe("show Dom");
|
||||||
expect(ctx.RawBody).toBe("show Dom");
|
expect(ctx.RawBody).toBe("show Dom");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("uses shared media models list when capability config is missing", async () => {
|
||||||
|
const { applyMediaUnderstanding } = await loadApply();
|
||||||
|
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
|
||||||
|
const imagePath = path.join(dir, "shared.jpg");
|
||||||
|
await fs.writeFile(imagePath, "image-bytes");
|
||||||
|
|
||||||
|
const ctx: MsgContext = {
|
||||||
|
Body: "<media:image>",
|
||||||
|
MediaPath: imagePath,
|
||||||
|
MediaType: "image/jpeg",
|
||||||
|
};
|
||||||
|
const cfg: ClawdbotConfig = {
|
||||||
|
tools: {
|
||||||
|
media: {
|
||||||
|
models: [
|
||||||
|
{
|
||||||
|
type: "cli",
|
||||||
|
command: "gemini",
|
||||||
|
args: ["--allowed-tools", "read_file", "{{MediaPath}}"],
|
||||||
|
capabilities: ["image"],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
const execModule = await import("../process/exec.js");
|
||||||
|
vi.mocked(execModule.runExec).mockResolvedValue({
|
||||||
|
stdout: "shared description\n",
|
||||||
|
stderr: "",
|
||||||
|
});
|
||||||
|
|
||||||
|
const result = await applyMediaUnderstanding({
|
||||||
|
ctx,
|
||||||
|
cfg,
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(result.appliedImage).toBe(true);
|
||||||
|
expect(ctx.Body).toBe("[Image]\nDescription:\nshared description");
|
||||||
|
});
|
||||||
|
|
||||||
|
it("handles multiple audio attachments when attachment mode is all", async () => {
|
||||||
|
const { applyMediaUnderstanding } = await loadApply();
|
||||||
|
const dir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-media-"));
|
||||||
|
const audioPathA = path.join(dir, "note-a.ogg");
|
||||||
|
const audioPathB = path.join(dir, "note-b.ogg");
|
||||||
|
await fs.writeFile(audioPathA, "hello");
|
||||||
|
await fs.writeFile(audioPathB, "world");
|
||||||
|
|
||||||
|
const ctx: MsgContext = {
|
||||||
|
Body: "<media:audio>",
|
||||||
|
MediaPaths: [audioPathA, audioPathB],
|
||||||
|
MediaTypes: ["audio/ogg", "audio/ogg"],
|
||||||
|
};
|
||||||
|
const cfg: ClawdbotConfig = {
|
||||||
|
tools: {
|
||||||
|
media: {
|
||||||
|
audio: {
|
||||||
|
enabled: true,
|
||||||
|
attachments: { mode: "all", maxAttachments: 2 },
|
||||||
|
models: [{ provider: "groq" }],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
const result = await applyMediaUnderstanding({
|
||||||
|
ctx,
|
||||||
|
cfg,
|
||||||
|
providers: {
|
||||||
|
groq: {
|
||||||
|
id: "groq",
|
||||||
|
transcribeAudio: async (req) => ({ text: req.fileName }),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(result.appliedAudio).toBe(true);
|
||||||
|
expect(ctx.Transcript).toBe("Audio 1:\nnote-a.ogg\n\nAudio 2:\nnote-b.ogg");
|
||||||
|
expect(ctx.Body).toBe(
|
||||||
|
["[Audio 1/2]\nTranscript:\nnote-a.ogg", "[Audio 2/2]\nTranscript:\nnote-b.ogg"].join(
|
||||||
|
"\n\n",
|
||||||
|
),
|
||||||
|
);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
386
src/media-understanding/attachments.ts
Normal file
386
src/media-understanding/attachments.ts
Normal file
@@ -0,0 +1,386 @@
|
|||||||
|
import crypto from "node:crypto";
|
||||||
|
import fs from "node:fs/promises";
|
||||||
|
import os from "node:os";
|
||||||
|
import path from "node:path";
|
||||||
|
import { fileURLToPath } from "node:url";
|
||||||
|
|
||||||
|
import type { MsgContext } from "../auto-reply/templating.js";
|
||||||
|
import type { MediaUnderstandingAttachmentsConfig } from "../config/types.tools.js";
|
||||||
|
import { fetchRemoteMedia, MediaFetchError } from "../media/fetch.js";
|
||||||
|
import { detectMime, getFileExtension, isAudioFileName } from "../media/mime.js";
|
||||||
|
import { logVerbose, shouldLogVerbose } from "../globals.js";
|
||||||
|
import { fetchWithTimeout } from "./providers/shared.js";
|
||||||
|
import type { MediaAttachment, MediaUnderstandingCapability } from "./types.js";
|
||||||
|
import { MediaUnderstandingSkipError } from "./errors.js";
|
||||||
|
|
||||||
|
type MediaBufferResult = {
|
||||||
|
buffer: Buffer;
|
||||||
|
mime?: string;
|
||||||
|
fileName: string;
|
||||||
|
size: number;
|
||||||
|
};
|
||||||
|
|
||||||
|
type MediaPathResult = {
|
||||||
|
path: string;
|
||||||
|
cleanup?: () => Promise<void> | void;
|
||||||
|
};
|
||||||
|
|
||||||
|
type AttachmentCacheEntry = {
|
||||||
|
attachment: MediaAttachment;
|
||||||
|
resolvedPath?: string;
|
||||||
|
statSize?: number;
|
||||||
|
buffer?: Buffer;
|
||||||
|
bufferMime?: string;
|
||||||
|
bufferFileName?: string;
|
||||||
|
tempPath?: string;
|
||||||
|
tempCleanup?: () => Promise<void>;
|
||||||
|
};
|
||||||
|
|
||||||
|
const DEFAULT_MAX_ATTACHMENTS = 1;
|
||||||
|
|
||||||
|
function normalizeAttachmentPath(raw?: string | null): string | undefined {
|
||||||
|
const value = raw?.trim();
|
||||||
|
if (!value) return undefined;
|
||||||
|
if (value.startsWith("file://")) {
|
||||||
|
try {
|
||||||
|
return fileURLToPath(value);
|
||||||
|
} catch {
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
export function normalizeAttachments(ctx: MsgContext): MediaAttachment[] {
|
||||||
|
const pathsFromArray = Array.isArray(ctx.MediaPaths) ? ctx.MediaPaths : undefined;
|
||||||
|
const urlsFromArray = Array.isArray(ctx.MediaUrls) ? ctx.MediaUrls : undefined;
|
||||||
|
const typesFromArray = Array.isArray(ctx.MediaTypes) ? ctx.MediaTypes : undefined;
|
||||||
|
const resolveMime = (count: number, index: number) => {
|
||||||
|
const typeHint = typesFromArray?.[index];
|
||||||
|
const trimmed = typeof typeHint === "string" ? typeHint.trim() : "";
|
||||||
|
if (trimmed) return trimmed;
|
||||||
|
return count === 1 ? ctx.MediaType : undefined;
|
||||||
|
};
|
||||||
|
|
||||||
|
if (pathsFromArray && pathsFromArray.length > 0) {
|
||||||
|
const count = pathsFromArray.length;
|
||||||
|
const urls = urlsFromArray && urlsFromArray.length > 0 ? urlsFromArray : undefined;
|
||||||
|
return pathsFromArray
|
||||||
|
.map((value, index) => ({
|
||||||
|
path: value?.trim() || undefined,
|
||||||
|
url: urls?.[index] ?? ctx.MediaUrl,
|
||||||
|
mime: resolveMime(count, index),
|
||||||
|
index,
|
||||||
|
}))
|
||||||
|
.filter((entry) => Boolean(entry.path?.trim() || entry.url?.trim()));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (urlsFromArray && urlsFromArray.length > 0) {
|
||||||
|
const count = urlsFromArray.length;
|
||||||
|
return urlsFromArray
|
||||||
|
.map((value, index) => ({
|
||||||
|
path: undefined,
|
||||||
|
url: value?.trim() || undefined,
|
||||||
|
mime: resolveMime(count, index),
|
||||||
|
index,
|
||||||
|
}))
|
||||||
|
.filter((entry) => Boolean(entry.url?.trim()));
|
||||||
|
}
|
||||||
|
|
||||||
|
const pathValue = ctx.MediaPath?.trim();
|
||||||
|
const url = ctx.MediaUrl?.trim();
|
||||||
|
if (!pathValue && !url) return [];
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
path: pathValue || undefined,
|
||||||
|
url: url || undefined,
|
||||||
|
mime: ctx.MediaType,
|
||||||
|
index: 0,
|
||||||
|
},
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
|
export function isVideoAttachment(attachment: MediaAttachment): boolean {
|
||||||
|
if (attachment.mime?.startsWith("video/")) return true;
|
||||||
|
const ext = getFileExtension(attachment.path ?? attachment.url);
|
||||||
|
if (!ext) return false;
|
||||||
|
return [".mp4", ".mov", ".mkv", ".webm", ".avi", ".m4v"].includes(ext);
|
||||||
|
}
|
||||||
|
|
||||||
|
export function isAudioAttachment(attachment: MediaAttachment): boolean {
|
||||||
|
if (attachment.mime?.startsWith("audio/")) return true;
|
||||||
|
return isAudioFileName(attachment.path ?? attachment.url);
|
||||||
|
}
|
||||||
|
|
||||||
|
export function isImageAttachment(attachment: MediaAttachment): boolean {
|
||||||
|
if (attachment.mime?.startsWith("image/")) return true;
|
||||||
|
const ext = getFileExtension(attachment.path ?? attachment.url);
|
||||||
|
if (!ext) return false;
|
||||||
|
return [".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tiff", ".tif"].includes(ext);
|
||||||
|
}
|
||||||
|
|
||||||
|
function isAbortError(err: unknown): boolean {
|
||||||
|
if (!err) return false;
|
||||||
|
if (err instanceof Error && err.name === "AbortError") return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
function resolveRequestUrl(input: RequestInfo | URL): string {
|
||||||
|
if (typeof input === "string") return input;
|
||||||
|
if (input instanceof URL) return input.toString();
|
||||||
|
return input.url;
|
||||||
|
}
|
||||||
|
|
||||||
|
function orderAttachments(
|
||||||
|
attachments: MediaAttachment[],
|
||||||
|
prefer?: MediaUnderstandingAttachmentsConfig["prefer"],
|
||||||
|
): MediaAttachment[] {
|
||||||
|
if (!prefer || prefer === "first") return attachments;
|
||||||
|
if (prefer === "last") return [...attachments].reverse();
|
||||||
|
if (prefer === "path") {
|
||||||
|
const withPath = attachments.filter((item) => item.path);
|
||||||
|
const withoutPath = attachments.filter((item) => !item.path);
|
||||||
|
return [...withPath, ...withoutPath];
|
||||||
|
}
|
||||||
|
if (prefer === "url") {
|
||||||
|
const withUrl = attachments.filter((item) => item.url);
|
||||||
|
const withoutUrl = attachments.filter((item) => !item.url);
|
||||||
|
return [...withUrl, ...withoutUrl];
|
||||||
|
}
|
||||||
|
return attachments;
|
||||||
|
}
|
||||||
|
|
||||||
|
export function selectAttachments(params: {
|
||||||
|
capability: MediaUnderstandingCapability;
|
||||||
|
attachments: MediaAttachment[];
|
||||||
|
policy?: MediaUnderstandingAttachmentsConfig;
|
||||||
|
}): MediaAttachment[] {
|
||||||
|
const { capability, attachments, policy } = params;
|
||||||
|
const matches = attachments.filter((item) => {
|
||||||
|
if (capability === "image") return isImageAttachment(item);
|
||||||
|
if (capability === "audio") return isAudioAttachment(item);
|
||||||
|
return isVideoAttachment(item);
|
||||||
|
});
|
||||||
|
if (matches.length === 0) return [];
|
||||||
|
|
||||||
|
const ordered = orderAttachments(matches, policy?.prefer);
|
||||||
|
const mode = policy?.mode ?? "first";
|
||||||
|
const maxAttachments = policy?.maxAttachments ?? DEFAULT_MAX_ATTACHMENTS;
|
||||||
|
if (mode === "all") {
|
||||||
|
return ordered.slice(0, Math.max(1, maxAttachments));
|
||||||
|
}
|
||||||
|
return ordered.slice(0, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Per-message cache of attachment bytes and filesystem paths.
 *
 * Each attachment (keyed by its `index`) is resolved at most once to a
 * Buffer and/or an on-disk path, from either a local path or a remote URL.
 * Size limits are enforced via MediaUnderstandingSkipError("maxBytes"),
 * fetch timeouts via MediaUnderstandingSkipError("timeout"). Temp files
 * written by getPath() are removed by cleanup().
 */
export class MediaAttachmentCache {
  // Cache entries keyed by attachment index; mutated in place as data loads.
  private readonly entries = new Map<number, AttachmentCacheEntry>();
  private readonly attachments: MediaAttachment[];

  constructor(attachments: MediaAttachment[]) {
    this.attachments = attachments;
    for (const attachment of attachments) {
      this.entries.set(attachment.index, { attachment });
    }
  }

  /**
   * Returns the attachment's bytes (with mime/fileName/size metadata),
   * loading them from cache, local disk, or the remote URL — in that order.
   *
   * Throws MediaUnderstandingSkipError with reason:
   * - "maxBytes" when the content exceeds params.maxBytes,
   * - "empty" when the attachment has neither path nor URL,
   * - "timeout" when the remote fetch is aborted by the timeout.
   * Other fetch failures propagate unchanged.
   */
  async getBuffer(params: {
    attachmentIndex: number;
    maxBytes: number;
    timeoutMs: number;
  }): Promise<MediaBufferResult> {
    const entry = await this.ensureEntry(params.attachmentIndex);
    // Fast path: bytes already cached from a previous call.
    if (entry.buffer) {
      if (entry.buffer.length > params.maxBytes) {
        throw new MediaUnderstandingSkipError(
          "maxBytes",
          `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
        );
      }
      return {
        buffer: entry.buffer,
        mime: entry.bufferMime,
        fileName: entry.bufferFileName ?? `media-${params.attachmentIndex + 1}`,
        size: entry.buffer.length,
      };
    }

    // Local file path: stat first so oversize files are rejected before read.
    if (entry.resolvedPath) {
      const size = await this.ensureLocalStat(entry);
      // Re-check: ensureLocalStat clears resolvedPath when the path is
      // missing or not a regular file; in that case fall through to the URL.
      if (entry.resolvedPath) {
        if (size !== undefined && size > params.maxBytes) {
          throw new MediaUnderstandingSkipError(
            "maxBytes",
            `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
          );
        }
        const buffer = await fs.readFile(entry.resolvedPath);
        entry.buffer = buffer;
        // Mime preference: previously detected > declared > sniffed from bytes.
        entry.bufferMime =
          entry.bufferMime ??
          entry.attachment.mime ??
          (await detectMime({
            buffer,
            filePath: entry.resolvedPath,
          }));
        entry.bufferFileName =
          path.basename(entry.resolvedPath) || `media-${params.attachmentIndex + 1}`;
        return {
          buffer,
          mime: entry.bufferMime,
          fileName: entry.bufferFileName,
          size: buffer.length,
        };
      }
    }

    // Remote fallback: fetch by URL with timeout and byte limit.
    const url = entry.attachment.url?.trim();
    if (!url) {
      throw new MediaUnderstandingSkipError(
        "empty",
        `Attachment ${params.attachmentIndex + 1} has no path or URL.`,
      );
    }

    try {
      const fetchImpl = (input: RequestInfo | URL, init?: RequestInit) =>
        fetchWithTimeout(resolveRequestUrl(input), init ?? {}, params.timeoutMs, fetch);
      const fetched = await fetchRemoteMedia({ url, fetchImpl, maxBytes: params.maxBytes });
      entry.buffer = fetched.buffer;
      // Mime preference here: declared > HTTP content-type > sniffed.
      entry.bufferMime =
        entry.attachment.mime ??
        fetched.contentType ??
        (await detectMime({
          buffer: fetched.buffer,
          filePath: fetched.fileName ?? url,
        }));
      entry.bufferFileName = fetched.fileName ?? `media-${params.attachmentIndex + 1}`;
      return {
        buffer: fetched.buffer,
        mime: entry.bufferMime,
        fileName: entry.bufferFileName,
        size: fetched.buffer.length,
      };
    } catch (err) {
      // Translate known fetch failures into skip reasons; rethrow the rest.
      if (err instanceof MediaFetchError && err.code === "max_bytes") {
        throw new MediaUnderstandingSkipError(
          "maxBytes",
          `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
        );
      }
      if (isAbortError(err)) {
        throw new MediaUnderstandingSkipError(
          "timeout",
          `Attachment ${params.attachmentIndex + 1} timed out while fetching.`,
        );
      }
      throw err;
    }
  }

  /**
   * Returns an on-disk path for the attachment, preferring the original
   * local path, then an already-materialized temp file, and finally writing
   * the fetched bytes to a fresh temp file (with a `cleanup` callback).
   *
   * maxBytes is optional here; when omitted, remote fetches are unbounded.
   */
  async getPath(params: {
    attachmentIndex: number;
    maxBytes?: number;
    timeoutMs: number;
  }): Promise<MediaPathResult> {
    const entry = await this.ensureEntry(params.attachmentIndex);
    if (entry.resolvedPath) {
      if (params.maxBytes) {
        const size = await this.ensureLocalStat(entry);
        // ensureLocalStat may clear resolvedPath on stat failure; re-check.
        if (entry.resolvedPath) {
          if (size !== undefined && size > params.maxBytes) {
            throw new MediaUnderstandingSkipError(
              "maxBytes",
              `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
            );
          }
        }
      }
      if (entry.resolvedPath) {
        // Original local file: no cleanup callback — we do not own it.
        return { path: entry.resolvedPath };
      }
    }

    // Temp file from an earlier getPath() call; re-validate size if bounded.
    if (entry.tempPath) {
      if (params.maxBytes && entry.buffer && entry.buffer.length > params.maxBytes) {
        throw new MediaUnderstandingSkipError(
          "maxBytes",
          `Attachment ${params.attachmentIndex + 1} exceeds maxBytes ${params.maxBytes}`,
        );
      }
      return { path: entry.tempPath, cleanup: entry.tempCleanup };
    }

    // Materialize: load bytes, then write them to a uniquely named temp file
    // that keeps the source extension (some consumers sniff by extension).
    const maxBytes = params.maxBytes ?? Number.POSITIVE_INFINITY;
    const bufferResult = await this.getBuffer({
      attachmentIndex: params.attachmentIndex,
      maxBytes,
      timeoutMs: params.timeoutMs,
    });
    const extension = path.extname(bufferResult.fileName || "") || "";
    const tmpPath = path.join(
      os.tmpdir(),
      `clawdbot-media-${crypto.randomUUID()}${extension}`,
    );
    await fs.writeFile(tmpPath, bufferResult.buffer);
    entry.tempPath = tmpPath;
    entry.tempCleanup = async () => {
      // Best-effort removal; ignore errors (file may already be gone).
      await fs.unlink(tmpPath).catch(() => {});
    };
    return { path: tmpPath, cleanup: entry.tempCleanup };
  }

  /**
   * Removes every temp file this cache created. Each cleanup callback is
   * cleared before awaiting so a second call is a no-op.
   */
  async cleanup(): Promise<void> {
    const cleanups: Array<Promise<void> | void> = [];
    for (const entry of this.entries.values()) {
      if (entry.tempCleanup) {
        cleanups.push(Promise.resolve(entry.tempCleanup()));
        entry.tempCleanup = undefined;
      }
    }
    await Promise.all(cleanups);
  }

  // Returns the cache entry for an index, creating one (with a resolved
  // local path, when available) on first access. Indexes not present in the
  // constructor's list get a synthetic attachment with only the index set.
  private async ensureEntry(attachmentIndex: number): Promise<AttachmentCacheEntry> {
    const existing = this.entries.get(attachmentIndex);
    if (existing) {
      if (!existing.resolvedPath) {
        existing.resolvedPath = this.resolveLocalPath(existing.attachment);
      }
      return existing;
    }
    const attachment =
      this.attachments.find((item) => item.index === attachmentIndex) ?? { index: attachmentIndex };
    const entry: AttachmentCacheEntry = {
      attachment,
      resolvedPath: this.resolveLocalPath(attachment),
    };
    this.entries.set(attachmentIndex, entry);
    return entry;
  }

  // Normalizes and absolutizes the attachment's declared path, if any.
  private resolveLocalPath(attachment: MediaAttachment): string | undefined {
    const rawPath = normalizeAttachmentPath(attachment.path);
    if (!rawPath) return undefined;
    return path.isAbsolute(rawPath) ? rawPath : path.resolve(rawPath);
  }

  // Stats the resolved local path once, caching the size. On failure (or if
  // the path is not a regular file) clears entry.resolvedPath so callers
  // fall back to the URL, and returns undefined.
  private async ensureLocalStat(entry: AttachmentCacheEntry): Promise<number | undefined> {
    if (!entry.resolvedPath) return undefined;
    if (entry.statSize !== undefined) return entry.statSize;
    try {
      const stat = await fs.stat(entry.resolvedPath);
      if (!stat.isFile()) {
        entry.resolvedPath = undefined;
        return undefined;
      }
      entry.statSize = stat.size;
      return stat.size;
    } catch (err) {
      entry.resolvedPath = undefined;
      if (shouldLogVerbose()) {
        logVerbose(`Failed to read attachment ${entry.attachment.index + 1}: ${String(err)}`);
      }
      return undefined;
    }
  }
}
|
||||||
29
src/media-understanding/concurrency.ts
Normal file
29
src/media-understanding/concurrency.ts
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
import { logVerbose, shouldLogVerbose } from "../globals.js";
|
||||||
|
|
||||||
|
export async function runWithConcurrency<T>(
|
||||||
|
tasks: Array<() => Promise<T>>,
|
||||||
|
limit: number,
|
||||||
|
): Promise<T[]> {
|
||||||
|
if (tasks.length === 0) return [];
|
||||||
|
const resolvedLimit = Math.max(1, Math.min(limit, tasks.length));
|
||||||
|
const results: T[] = Array.from({ length: tasks.length });
|
||||||
|
let next = 0;
|
||||||
|
|
||||||
|
const workers = Array.from({ length: resolvedLimit }, async () => {
|
||||||
|
while (true) {
|
||||||
|
const index = next;
|
||||||
|
next += 1;
|
||||||
|
if (index >= tasks.length) return;
|
||||||
|
try {
|
||||||
|
results[index] = await tasks[index]();
|
||||||
|
} catch (err) {
|
||||||
|
if (shouldLogVerbose()) {
|
||||||
|
logVerbose(`Media understanding task failed: ${String(err)}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
await Promise.allSettled(workers);
|
||||||
|
return results;
|
||||||
|
}
|
||||||
35
src/media-understanding/defaults.ts
Normal file
35
src/media-understanding/defaults.ts
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
import type { MediaUnderstandingCapability } from "./types.js";
|
||||||
|
|
||||||
|
// One mebibyte, used for byte-size defaults below.
const MB = 1024 * 1024;

// Default cap on generated description length (characters).
export const DEFAULT_MAX_CHARS = 500;
// Per-capability maxChars defaults; audio is deliberately uncapped because
// transcripts should not be truncated.
export const DEFAULT_MAX_CHARS_BY_CAPABILITY: Record<
  MediaUnderstandingCapability,
  number | undefined
> = {
  image: DEFAULT_MAX_CHARS,
  audio: undefined,
  video: DEFAULT_MAX_CHARS,
};
// Largest media payload sent to a provider, per capability.
export const DEFAULT_MAX_BYTES: Record<MediaUnderstandingCapability, number> = {
  image: 10 * MB,
  audio: 20 * MB,
  video: 50 * MB,
};
// Per-capability request timeouts (seconds); video gets longer processing time.
export const DEFAULT_TIMEOUT_SECONDS: Record<MediaUnderstandingCapability, number> = {
  image: 60,
  audio: 60,
  video: 120,
};
// Prompts used when the capability config does not override one.
export const DEFAULT_PROMPT: Record<MediaUnderstandingCapability, string> = {
  image: "Describe the image.",
  audio: "Transcribe the audio.",
  video: "Describe the video.",
};
// Upper bound on base64-encoded video payload size.
export const DEFAULT_VIDEO_MAX_BASE64_BYTES = 70 * MB;
// Fallback audio-transcription model per provider id.
export const DEFAULT_AUDIO_MODELS: Record<string, string> = {
  groq: "whisper-large-v3-turbo",
  openai: "whisper-1",
};
// Max stdout/stderr buffered from CLI-based media models.
export const CLI_OUTPUT_MAX_BUFFER = 5 * MB;
// Default number of capability runs executed concurrently.
export const DEFAULT_MEDIA_CONCURRENCY = 2;
|
||||||
17
src/media-understanding/errors.ts
Normal file
17
src/media-understanding/errors.ts
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
export type MediaUnderstandingSkipReason = "maxBytes" | "timeout" | "unsupported" | "empty";
|
||||||
|
|
||||||
|
export class MediaUnderstandingSkipError extends Error {
|
||||||
|
readonly reason: MediaUnderstandingSkipReason;
|
||||||
|
|
||||||
|
constructor(reason: MediaUnderstandingSkipReason, message: string) {
|
||||||
|
super(message);
|
||||||
|
this.reason = reason;
|
||||||
|
this.name = "MediaUnderstandingSkipError";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Type guard for MediaUnderstandingSkipError, narrowing `unknown` errors in
 * catch blocks.
 */
export function isMediaUnderstandingSkipError(
  err: unknown,
): err is MediaUnderstandingSkipError {
  return err instanceof MediaUnderstandingSkipError;
}
|
||||||
@@ -12,7 +12,7 @@ export function extractMediaUserText(body?: string): string | undefined {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function formatSection(
|
function formatSection(
|
||||||
title: "Audio" | "Video" | "Image",
|
title: string,
|
||||||
kind: "Transcript" | "Description",
|
kind: "Transcript" | "Description",
|
||||||
text: string,
|
text: string,
|
||||||
userText?: string,
|
userText?: string,
|
||||||
@@ -40,11 +40,21 @@ export function formatMediaUnderstandingBody(params: {
|
|||||||
sections.push(`User text:\n${userText}`);
|
sections.push(`User text:\n${userText}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const counts = new Map<MediaUnderstandingOutput["kind"], number>();
|
||||||
for (const output of outputs) {
|
for (const output of outputs) {
|
||||||
|
counts.set(output.kind, (counts.get(output.kind) ?? 0) + 1);
|
||||||
|
}
|
||||||
|
const seen = new Map<MediaUnderstandingOutput["kind"], number>();
|
||||||
|
|
||||||
|
for (const output of outputs) {
|
||||||
|
const count = counts.get(output.kind) ?? 1;
|
||||||
|
const next = (seen.get(output.kind) ?? 0) + 1;
|
||||||
|
seen.set(output.kind, next);
|
||||||
|
const suffix = count > 1 ? ` ${next}/${count}` : "";
|
||||||
if (output.kind === "audio.transcription") {
|
if (output.kind === "audio.transcription") {
|
||||||
sections.push(
|
sections.push(
|
||||||
formatSection(
|
formatSection(
|
||||||
"Audio",
|
`Audio${suffix}`,
|
||||||
"Transcript",
|
"Transcript",
|
||||||
output.text,
|
output.text,
|
||||||
outputs.length === 1 ? userText : undefined,
|
outputs.length === 1 ? userText : undefined,
|
||||||
@@ -55,7 +65,7 @@ export function formatMediaUnderstandingBody(params: {
|
|||||||
if (output.kind === "image.description") {
|
if (output.kind === "image.description") {
|
||||||
sections.push(
|
sections.push(
|
||||||
formatSection(
|
formatSection(
|
||||||
"Image",
|
`Image${suffix}`,
|
||||||
"Description",
|
"Description",
|
||||||
output.text,
|
output.text,
|
||||||
outputs.length === 1 ? userText : undefined,
|
outputs.length === 1 ? userText : undefined,
|
||||||
@@ -65,7 +75,7 @@ export function formatMediaUnderstandingBody(params: {
|
|||||||
}
|
}
|
||||||
sections.push(
|
sections.push(
|
||||||
formatSection(
|
formatSection(
|
||||||
"Video",
|
`Video${suffix}`,
|
||||||
"Description",
|
"Description",
|
||||||
output.text,
|
output.text,
|
||||||
outputs.length === 1 ? userText : undefined,
|
outputs.length === 1 ? userText : undefined,
|
||||||
@@ -75,3 +85,10 @@ export function formatMediaUnderstandingBody(params: {
|
|||||||
|
|
||||||
return sections.join("\n\n").trim();
|
return sections.join("\n\n").trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export function formatAudioTranscripts(outputs: MediaUnderstandingOutput[]): string {
|
||||||
|
if (outputs.length === 1) return outputs[0].text;
|
||||||
|
return outputs
|
||||||
|
.map((output, index) => `Audio ${index + 1}:\n${output.text}`)
|
||||||
|
.join("\n\n");
|
||||||
|
}
|
||||||
|
|||||||
7
src/media-understanding/providers/anthropic/index.ts
Normal file
7
src/media-understanding/providers/anthropic/index.ts
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||||
|
import { describeImageWithModel } from "../image.js";
|
||||||
|
|
||||||
|
// Anthropic media-understanding provider: image description only (no audio
// transcription or video description hooks).
export const anthropicProvider: MediaUnderstandingProvider = {
  id: "anthropic",
  describeImage: describeImageWithModel,
};
|
||||||
@@ -1,7 +1,9 @@
|
|||||||
import type { MediaUnderstandingProvider } from "../../types.js";
|
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||||
|
import { describeImageWithModel } from "../image.js";
|
||||||
import { describeGeminiVideo } from "./video.js";
|
import { describeGeminiVideo } from "./video.js";
|
||||||
|
|
||||||
export const googleProvider: MediaUnderstandingProvider = {
|
export const googleProvider: MediaUnderstandingProvider = {
|
||||||
id: "google",
|
id: "google",
|
||||||
|
describeImage: describeImageWithModel,
|
||||||
describeVideo: describeGeminiVideo,
|
describeVideo: describeGeminiVideo,
|
||||||
};
|
};
|
||||||
|
|||||||
66
src/media-understanding/providers/image.ts
Normal file
66
src/media-understanding/providers/image.ts
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
import type { Api, AssistantMessage, Context, Model } from "@mariozechner/pi-ai";
|
||||||
|
import { complete } from "@mariozechner/pi-ai";
|
||||||
|
import { discoverAuthStorage, discoverModels } from "@mariozechner/pi-coding-agent";
|
||||||
|
|
||||||
|
import { getApiKeyForModel } from "../../agents/model-auth.js";
|
||||||
|
import { ensureClawdbotModelsJson } from "../../agents/models-config.js";
|
||||||
|
import { minimaxUnderstandImage } from "../../agents/minimax-vlm.js";
|
||||||
|
import { coerceImageAssistantText } from "../../agents/tools/image-tool.helpers.js";
|
||||||
|
import type { ImageDescriptionRequest, ImageDescriptionResult } from "../types.js";
|
||||||
|
|
||||||
|
/**
 * Describes an image with a configured provider/model pair.
 *
 * Resolves the model from the local registry, validates image-input support,
 * obtains an API key, then either calls the MiniMax-specific VLM endpoint or
 * runs a generic one-shot chat completion with the image inlined as base64.
 *
 * @throws Error when the model is unknown or does not accept image input.
 */
export async function describeImageWithModel(
  params: ImageDescriptionRequest,
): Promise<ImageDescriptionResult> {
  // Make sure the models registry file exists before discovery reads it.
  await ensureClawdbotModelsJson(params.cfg, params.agentDir);
  const authStorage = discoverAuthStorage(params.agentDir);
  const modelRegistry = discoverModels(authStorage, params.agentDir);
  const model = modelRegistry.find(params.provider, params.model) as Model<Api> | null;
  if (!model) {
    throw new Error(`Unknown model: ${params.provider}/${params.model}`);
  }
  if (!model.input?.includes("image")) {
    throw new Error(`Model does not support images: ${params.provider}/${params.model}`);
  }
  const apiKeyInfo = await getApiKeyForModel({
    model,
    cfg: params.cfg,
    agentDir: params.agentDir,
    profileId: params.profile,
    preferredProfile: params.preferredProfile,
  });
  // Register the key so downstream pi-ai calls can authenticate.
  authStorage.setRuntimeApiKey(model.provider, apiKeyInfo.apiKey);

  const base64 = params.buffer.toString("base64");
  // MiniMax uses a dedicated VLM endpoint rather than the generic chat API.
  if (model.provider === "minimax") {
    const text = await minimaxUnderstandImage({
      apiKey: apiKeyInfo.apiKey,
      prompt: params.prompt ?? "Describe the image.",
      imageDataUrl: `data:${params.mime ?? "image/jpeg"};base64,${base64}`,
      modelBaseUrl: model.baseUrl,
    });
    return { text, model: model.id };
  }

  // Generic path: single user turn with prompt text plus the inline image.
  const context: Context = {
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: params.prompt ?? "Describe the image." },
          // mime defaults to image/jpeg when the attachment did not declare one.
          { type: "image", data: base64, mimeType: params.mime ?? "image/jpeg" },
        ],
        timestamp: Date.now(),
      },
    ],
  };
  const message = (await complete(model, context, {
    apiKey: apiKeyInfo.apiKey,
    maxTokens: params.maxTokens ?? 512,
  })) as AssistantMessage;
  // Normalize the assistant message into plain description text.
  const text = coerceImageAssistantText({
    message,
    provider: model.provider,
    model: model.id,
  });
  return { text, model: model.id };
}
|
||||||
@@ -1,10 +1,18 @@
|
|||||||
import { normalizeProviderId } from "../../agents/model-selection.js";
|
import { normalizeProviderId } from "../../agents/model-selection.js";
|
||||||
import type { MediaUnderstandingProvider } from "../types.js";
|
import type { MediaUnderstandingProvider } from "../types.js";
|
||||||
|
import { anthropicProvider } from "./anthropic/index.js";
|
||||||
import { googleProvider } from "./google/index.js";
|
import { googleProvider } from "./google/index.js";
|
||||||
import { groqProvider } from "./groq/index.js";
|
import { groqProvider } from "./groq/index.js";
|
||||||
|
import { minimaxProvider } from "./minimax/index.js";
|
||||||
import { openaiProvider } from "./openai/index.js";
|
import { openaiProvider } from "./openai/index.js";
|
||||||
|
|
||||||
const PROVIDERS: MediaUnderstandingProvider[] = [groqProvider, openaiProvider, googleProvider];
|
const PROVIDERS: MediaUnderstandingProvider[] = [
|
||||||
|
groqProvider,
|
||||||
|
openaiProvider,
|
||||||
|
googleProvider,
|
||||||
|
anthropicProvider,
|
||||||
|
minimaxProvider,
|
||||||
|
];
|
||||||
|
|
||||||
export function normalizeMediaProviderId(id: string): string {
|
export function normalizeMediaProviderId(id: string): string {
|
||||||
const normalized = normalizeProviderId(id);
|
const normalized = normalizeProviderId(id);
|
||||||
|
|||||||
7
src/media-understanding/providers/minimax/index.ts
Normal file
7
src/media-understanding/providers/minimax/index.ts
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||||
|
import { describeImageWithModel } from "../image.js";
|
||||||
|
|
||||||
|
// MiniMax media-understanding provider: image description only (routed to
// the MiniMax VLM endpoint inside describeImageWithModel).
export const minimaxProvider: MediaUnderstandingProvider = {
  id: "minimax",
  describeImage: describeImageWithModel,
};
|
||||||
@@ -1,7 +1,9 @@
|
|||||||
import type { MediaUnderstandingProvider } from "../../types.js";
|
import type { MediaUnderstandingProvider } from "../../types.js";
|
||||||
|
import { describeImageWithModel } from "../image.js";
|
||||||
import { transcribeOpenAiCompatibleAudio } from "./audio.js";
|
import { transcribeOpenAiCompatibleAudio } from "./audio.js";
|
||||||
|
|
||||||
export const openaiProvider: MediaUnderstandingProvider = {
|
export const openaiProvider: MediaUnderstandingProvider = {
|
||||||
id: "openai",
|
id: "openai",
|
||||||
|
describeImage: describeImageWithModel,
|
||||||
transcribeAudio: transcribeOpenAiCompatibleAudio,
|
transcribeAudio: transcribeOpenAiCompatibleAudio,
|
||||||
};
|
};
|
||||||
|
|||||||
154
src/media-understanding/resolve.ts
Normal file
154
src/media-understanding/resolve.ts
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
import type { ClawdbotConfig } from "../config/config.js";
|
||||||
|
import type { MsgContext } from "../auto-reply/templating.js";
|
||||||
|
import type {
|
||||||
|
MediaUnderstandingConfig,
|
||||||
|
MediaUnderstandingModelConfig,
|
||||||
|
MediaUnderstandingScopeConfig,
|
||||||
|
} from "../config/types.tools.js";
|
||||||
|
import { logVerbose, shouldLogVerbose } from "../globals.js";
|
||||||
|
import {
|
||||||
|
DEFAULT_MAX_BYTES,
|
||||||
|
DEFAULT_MAX_CHARS_BY_CAPABILITY,
|
||||||
|
DEFAULT_MEDIA_CONCURRENCY,
|
||||||
|
DEFAULT_PROMPT,
|
||||||
|
} from "./defaults.js";
|
||||||
|
import { normalizeMediaProviderId } from "./providers/index.js";
|
||||||
|
import { normalizeMediaUnderstandingChatType, resolveMediaUnderstandingScope } from "./scope.js";
|
||||||
|
import type { MediaUnderstandingCapability } from "./types.js";
|
||||||
|
|
||||||
|
export function resolveTimeoutMs(seconds: number | undefined, fallbackSeconds: number): number {
|
||||||
|
const value = typeof seconds === "number" && Number.isFinite(seconds) ? seconds : fallbackSeconds;
|
||||||
|
return Math.max(1000, Math.floor(value * 1000));
|
||||||
|
}
|
||||||
|
|
||||||
|
export function resolvePrompt(
|
||||||
|
capability: MediaUnderstandingCapability,
|
||||||
|
prompt?: string,
|
||||||
|
maxChars?: number,
|
||||||
|
): string {
|
||||||
|
const base = prompt?.trim() || DEFAULT_PROMPT[capability];
|
||||||
|
if (!maxChars || capability === "audio") return base;
|
||||||
|
return `${base} Respond in at most ${maxChars} characters.`;
|
||||||
|
}
|
||||||
|
|
||||||
|
export function resolveMaxChars(params: {
|
||||||
|
capability: MediaUnderstandingCapability;
|
||||||
|
entry: MediaUnderstandingModelConfig;
|
||||||
|
cfg: ClawdbotConfig;
|
||||||
|
config?: MediaUnderstandingConfig;
|
||||||
|
}): number | undefined {
|
||||||
|
const { capability, entry, cfg } = params;
|
||||||
|
const configured =
|
||||||
|
entry.maxChars ?? params.config?.maxChars ?? cfg.tools?.media?.[capability]?.maxChars;
|
||||||
|
if (typeof configured === "number") return configured;
|
||||||
|
return DEFAULT_MAX_CHARS_BY_CAPABILITY[capability];
|
||||||
|
}
|
||||||
|
|
||||||
|
export function resolveMaxBytes(params: {
|
||||||
|
capability: MediaUnderstandingCapability;
|
||||||
|
entry: MediaUnderstandingModelConfig;
|
||||||
|
cfg: ClawdbotConfig;
|
||||||
|
config?: MediaUnderstandingConfig;
|
||||||
|
}): number {
|
||||||
|
const configured =
|
||||||
|
params.entry.maxBytes ??
|
||||||
|
params.config?.maxBytes ??
|
||||||
|
params.cfg.tools?.media?.[params.capability]?.maxBytes;
|
||||||
|
if (typeof configured === "number") return configured;
|
||||||
|
return DEFAULT_MAX_BYTES[params.capability];
|
||||||
|
}
|
||||||
|
|
||||||
|
export function resolveCapabilityConfig(
|
||||||
|
cfg: ClawdbotConfig,
|
||||||
|
capability: MediaUnderstandingCapability,
|
||||||
|
): MediaUnderstandingConfig | undefined {
|
||||||
|
return cfg.tools?.media?.[capability];
|
||||||
|
}
|
||||||
|
|
||||||
|
export function resolveScopeDecision(params: {
|
||||||
|
scope?: MediaUnderstandingScopeConfig;
|
||||||
|
ctx: MsgContext;
|
||||||
|
}): "allow" | "deny" {
|
||||||
|
return resolveMediaUnderstandingScope({
|
||||||
|
scope: params.scope,
|
||||||
|
sessionKey: params.ctx.SessionKey,
|
||||||
|
channel: params.ctx.Surface ?? params.ctx.Provider,
|
||||||
|
chatType: normalizeMediaUnderstandingChatType(params.ctx.ChatType),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
function inferCapabilities(
|
||||||
|
entry: MediaUnderstandingModelConfig,
|
||||||
|
): MediaUnderstandingCapability[] | undefined {
|
||||||
|
if ((entry.type ?? (entry.command ? "cli" : "provider")) === "cli") {
|
||||||
|
return ["image", "audio", "video"];
|
||||||
|
}
|
||||||
|
const provider = normalizeMediaProviderId(entry.provider ?? "");
|
||||||
|
if (!provider) return undefined;
|
||||||
|
if (provider === "openai" || provider === "anthropic" || provider === "minimax") {
|
||||||
|
return ["image"];
|
||||||
|
}
|
||||||
|
if (provider === "google") {
|
||||||
|
return ["image", "audio", "video"];
|
||||||
|
}
|
||||||
|
if (provider === "groq") {
|
||||||
|
return ["audio"];
|
||||||
|
}
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
|
|
||||||
|
export function resolveModelEntries(params: {
|
||||||
|
cfg: ClawdbotConfig;
|
||||||
|
capability: MediaUnderstandingCapability;
|
||||||
|
config?: MediaUnderstandingConfig;
|
||||||
|
}): MediaUnderstandingModelConfig[] {
|
||||||
|
const { cfg, capability, config } = params;
|
||||||
|
const sharedModels = cfg.tools?.media?.models ?? [];
|
||||||
|
const entries = [
|
||||||
|
...(config?.models ?? []).map((entry) => ({ entry, source: "capability" as const })),
|
||||||
|
...sharedModels.map((entry) => ({ entry, source: "shared" as const })),
|
||||||
|
];
|
||||||
|
if (entries.length === 0) return [];
|
||||||
|
|
||||||
|
return entries
|
||||||
|
.filter(({ entry, source }) => {
|
||||||
|
const caps =
|
||||||
|
entry.capabilities && entry.capabilities.length > 0
|
||||||
|
? entry.capabilities
|
||||||
|
: source === "shared"
|
||||||
|
? inferCapabilities(entry)
|
||||||
|
: undefined;
|
||||||
|
if (!caps || caps.length === 0) {
|
||||||
|
if (source === "shared") {
|
||||||
|
if (shouldLogVerbose()) {
|
||||||
|
logVerbose(
|
||||||
|
`Skipping shared media model without capabilities: ${entry.provider ?? entry.command ?? "unknown"}`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return caps.includes(capability);
|
||||||
|
})
|
||||||
|
.map(({ entry }) => entry);
|
||||||
|
}
|
||||||
|
|
||||||
|
export function resolveConcurrency(cfg: ClawdbotConfig): number {
|
||||||
|
const configured = cfg.tools?.media?.concurrency;
|
||||||
|
if (typeof configured === "number" && Number.isFinite(configured) && configured > 0) {
|
||||||
|
return Math.floor(configured);
|
||||||
|
}
|
||||||
|
return DEFAULT_MEDIA_CONCURRENCY;
|
||||||
|
}
|
||||||
|
|
||||||
|
export function resolveCapabilityEnabled(params: {
|
||||||
|
cfg: ClawdbotConfig;
|
||||||
|
config?: MediaUnderstandingConfig;
|
||||||
|
}): boolean {
|
||||||
|
if (params.config?.enabled === false) return false;
|
||||||
|
const sharedModels = params.cfg.tools?.media?.models ?? [];
|
||||||
|
const hasModels = (params.config?.models?.length ?? 0) > 0 || sharedModels.length > 0;
|
||||||
|
if (!hasModels) return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
@@ -3,6 +3,8 @@ export type MediaUnderstandingKind =
|
|||||||
| "video.description"
|
| "video.description"
|
||||||
| "image.description";
|
| "image.description";
|
||||||
|
|
||||||
|
export type MediaUnderstandingCapability = "image" | "audio" | "video";
|
||||||
|
|
||||||
export type MediaAttachment = {
|
export type MediaAttachment = {
|
||||||
path?: string;
|
path?: string;
|
||||||
url?: string;
|
url?: string;
|
||||||
@@ -55,8 +57,29 @@ export type VideoDescriptionResult = {
|
|||||||
model?: string;
|
model?: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Request payload for a single image-description call.
export type ImageDescriptionRequest = {
  buffer: Buffer; // raw image bytes (encoded to base64 by the provider)
  fileName: string;
  mime?: string; // image MIME type; providers default to image/jpeg when absent
  model: string;
  provider: string;
  prompt?: string; // optional prompt override; providers supply a default
  maxTokens?: number;
  timeoutMs: number;
  profile?: string; // auth profile id to use for the API key
  preferredProfile?: string;
  agentDir: string;
  cfg: import("../config/config.js").ClawdbotConfig;
};

// Result of an image-description call; `model` echoes the model actually used.
export type ImageDescriptionResult = {
  text: string;
  model?: string;
};
||||||
|
|
||||||
export type MediaUnderstandingProvider = {
|
export type MediaUnderstandingProvider = {
|
||||||
id: string;
|
id: string;
|
||||||
transcribeAudio?: (req: AudioTranscriptionRequest) => Promise<AudioTranscriptionResult>;
|
transcribeAudio?: (req: AudioTranscriptionRequest) => Promise<AudioTranscriptionResult>;
|
||||||
describeVideo?: (req: VideoDescriptionRequest) => Promise<VideoDescriptionResult>;
|
describeVideo?: (req: VideoDescriptionRequest) => Promise<VideoDescriptionResult>;
|
||||||
|
describeImage?: (req: ImageDescriptionRequest) => Promise<ImageDescriptionResult>;
|
||||||
};
|
};
|
||||||
|
|||||||
10
src/media-understanding/video.ts
Normal file
10
src/media-understanding/video.ts
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
import { DEFAULT_VIDEO_MAX_BASE64_BYTES } from "./defaults.js";
|
||||||
|
|
||||||
|
export function estimateBase64Size(bytes: number): number {
|
||||||
|
return Math.ceil(bytes / 3) * 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
export function resolveVideoMaxBase64Bytes(maxBytes: number): number {
|
||||||
|
const expanded = Math.floor(maxBytes * (4 / 3));
|
||||||
|
return Math.min(expanded, DEFAULT_VIDEO_MAX_BASE64_BYTES);
|
||||||
|
}
|
||||||
@@ -8,6 +8,18 @@ type FetchMediaResult = {
|
|||||||
fileName?: string;
|
fileName?: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
export type MediaFetchErrorCode = "max_bytes" | "http_error" | "fetch_failed";
|
||||||
|
|
||||||
|
export class MediaFetchError extends Error {
|
||||||
|
readonly code: MediaFetchErrorCode;
|
||||||
|
|
||||||
|
constructor(code: MediaFetchErrorCode, message: string) {
|
||||||
|
super(message);
|
||||||
|
this.code = code;
|
||||||
|
this.name = "MediaFetchError";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
export type FetchLike = (input: RequestInfo | URL, init?: RequestInit) => Promise<Response>;
|
export type FetchLike = (input: RequestInfo | URL, init?: RequestInit) => Promise<Response>;
|
||||||
|
|
||||||
type FetchMediaOptions = {
|
type FetchMediaOptions = {
|
||||||
@@ -62,7 +74,7 @@ export async function fetchRemoteMedia(options: FetchMediaOptions): Promise<Fetc
|
|||||||
try {
|
try {
|
||||||
res = await fetcher(url);
|
res = await fetcher(url);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
throw new Error(`Failed to fetch media from ${url}: ${String(err)}`);
|
throw new MediaFetchError("fetch_failed", `Failed to fetch media from ${url}: ${String(err)}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!res.ok) {
|
if (!res.ok) {
|
||||||
@@ -75,14 +87,18 @@ export async function fetchRemoteMedia(options: FetchMediaOptions): Promise<Fetc
|
|||||||
const snippet = await readErrorBodySnippet(res);
|
const snippet = await readErrorBodySnippet(res);
|
||||||
if (snippet) detail += `; body: ${snippet}`;
|
if (snippet) detail += `; body: ${snippet}`;
|
||||||
}
|
}
|
||||||
throw new Error(`Failed to fetch media from ${url}${redirected}: ${detail}`);
|
throw new MediaFetchError(
|
||||||
|
"http_error",
|
||||||
|
`Failed to fetch media from ${url}${redirected}: ${detail}`,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
const contentLength = res.headers.get("content-length");
|
const contentLength = res.headers.get("content-length");
|
||||||
if (maxBytes && contentLength) {
|
if (maxBytes && contentLength) {
|
||||||
const length = Number(contentLength);
|
const length = Number(contentLength);
|
||||||
if (Number.isFinite(length) && length > maxBytes) {
|
if (Number.isFinite(length) && length > maxBytes) {
|
||||||
throw new Error(
|
throw new MediaFetchError(
|
||||||
|
"max_bytes",
|
||||||
`Failed to fetch media from ${url}: content length ${length} exceeds maxBytes ${maxBytes}`,
|
`Failed to fetch media from ${url}: content length ${length} exceeds maxBytes ${maxBytes}`,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -128,7 +144,8 @@ async function readResponseWithLimit(res: Response, maxBytes: number): Promise<B
|
|||||||
if (!body || typeof body.getReader !== "function") {
|
if (!body || typeof body.getReader !== "function") {
|
||||||
const fallback = Buffer.from(await res.arrayBuffer());
|
const fallback = Buffer.from(await res.arrayBuffer());
|
||||||
if (fallback.length > maxBytes) {
|
if (fallback.length > maxBytes) {
|
||||||
throw new Error(
|
throw new MediaFetchError(
|
||||||
|
"max_bytes",
|
||||||
`Failed to fetch media from ${res.url || "response"}: payload exceeds maxBytes ${maxBytes}`,
|
`Failed to fetch media from ${res.url || "response"}: payload exceeds maxBytes ${maxBytes}`,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -148,7 +165,8 @@ async function readResponseWithLimit(res: Response, maxBytes: number): Promise<B
|
|||||||
try {
|
try {
|
||||||
await reader.cancel();
|
await reader.cancel();
|
||||||
} catch {}
|
} catch {}
|
||||||
throw new Error(
|
throw new MediaFetchError(
|
||||||
|
"max_bytes",
|
||||||
`Failed to fetch media from ${res.url || "response"}: payload exceeds maxBytes ${maxBytes}`,
|
`Failed to fetch media from ${res.url || "response"}: payload exceeds maxBytes ${maxBytes}`,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user