fix: gate image tool and deepgram audio payload

2026-01-17 09:33:53 +00:00
parent d66bc65ca6
commit b6ea5895b6
9 changed files with 139 additions and 31 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -81,6 +81,7 @@ Docs: https://docs.clawd.bot
 - WhatsApp: scope self-chat response prefix; inject pending-only group history and clear after any processed message.
 - WhatsApp: include `linked` field in `describeAccount`.
 - Agents: drop unsigned Gemini tool calls and avoid JSON Schema `format` keyword collisions.
+- Agents: hide the image tool when the primary model already supports images.
 - Agents: avoid duplicate sends by replying with `NO_REPLY` after `message` tool sends.
 - Auth: inherit/merge sub-agent auth profiles from the main agent.
 - Gateway: resolve local auth for security probe and validate gateway token/password file modes. (#1011, #1022) — thanks @ivanrvpereira, @kkarimi.
--- a/src/agents/tools/image-tool.test.ts
+++ b/src/agents/tools/image-tool.test.ts
@@ -102,6 +102,27 @@ describe("image tool implicit imageModel config", () => {
    });
  });

+  it("disables image tool when primary model already supports images", async () => {
+    const agentDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-image-")); // fresh temp agent dir for this test
+    const cfg: ClawdbotConfig = {
+      agents: {
+        defaults: {
+          model: { primary: "acme/vision-1" }, // primary model under test
+          imageModel: { primary: "openai/gpt-5-mini" }, // explicit image model is set, yet the tool is still expected to be disabled
+        },
+      },
+      models: {
+        providers: {
+          acme: {
+            models: [{ id: "vision-1", input: ["text", "image"] }], // primary model declares "image" input
+          },
+        },
+      },
+    };
+    expect(resolveImageModelConfigForTool({ cfg, agentDir })).toBeNull(); // no image model config is resolved
+    expect(createImageTool({ config: cfg, agentDir })).toBeNull(); // and no tool is created
+  });
+
  it("sandboxes image paths like the read tool", async () => {
    const stateDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-image-sandbox-"));
    const agentDir = path.join(stateDir, "agent");
--- a/src/agents/tools/image-tool.ts
+++ b/src/agents/tools/image-tool.ts
@@ -1,3 +1,4 @@
+import fsSync from "node:fs";
 import fs from "node:fs/promises";
 import path from "node:path";

@@ -19,7 +20,7 @@ import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "../defaults.js";
 import { minimaxUnderstandImage } from "../minimax-vlm.js";
 import { getApiKeyForModel, resolveEnvApiKey } from "../model-auth.js";
 import { runWithImageModelFallback } from "../model-fallback.js";
-import { parseModelRef } from "../model-selection.js";
+import { normalizeProviderId, resolveConfiguredModelRef } from "../model-selection.js";
 import { ensureClawdbotModelsJson } from "../models-config.js";
 import { assertSandboxPath } from "../sandbox-paths.js";
 import type { AnyAgentTool } from "./common.js";
@@ -42,12 +43,15 @@ function resolveDefaultModelRef(cfg?: ClawdbotConfig): {
  provider: string;
  model: string;
 } {
-  const modelConfig = cfg?.agents?.defaults?.model as { primary?: string } | string | undefined;
-  const raw = typeof modelConfig === "string" ? modelConfig.trim() : modelConfig?.primary?.trim();
-  const parsed =
-    parseModelRef(raw ?? "", DEFAULT_PROVIDER) ??
-    ({ provider: DEFAULT_PROVIDER, model: DEFAULT_MODEL } as const);
-  return { provider: parsed.provider, model: parsed.model };
+  if (cfg) {
+    const resolved = resolveConfiguredModelRef({
+      cfg,
+      defaultProvider: DEFAULT_PROVIDER,
+      defaultModel: DEFAULT_MODEL,
+    });
+    return { provider: resolved.provider, model: resolved.model };
+  }
+  return { provider: DEFAULT_PROVIDER, model: DEFAULT_MODEL };
 }

 function hasAuthForProvider(params: { provider: string; agentDir: string }): boolean {
@@ -58,6 +62,77 @@ function hasAuthForProvider(params: { provider: string; agentDir: string }): boo
  return listProfilesForProvider(store, params.provider).length > 0;
 }

+type ProviderModelEntry = { // One model entry in a provider's `models` list; only the fields read by the image-support check.
+  id?: string; // model id, matched against the primary model id
+  input?: string[]; // declared input kinds; "image" marks image support
+};
+
+type ProviderConfigLike = { // Minimal provider shape: just the optional model list that resolveModelSupportsImages inspects.
+  models?: ProviderModelEntry[];
+};
+
+function resolveProviderConfig( // Look up a provider's config entry, comparing keys after normalizeProviderId.
+  providers: Record<string, ProviderConfigLike> | undefined, // provider map to search; may be undefined
+  provider: string, // provider id to look for
+): ProviderConfigLike | null { // null when the map is absent or no key matches
+  if (!providers) return null;
+  const normalized = normalizeProviderId(provider);
+  for (const [key, value] of Object.entries(providers)) {
+    if (normalizeProviderId(key) === normalized) return value; // first matching key wins
+  }
+  return null;
+}
+
+function resolveModelSupportsImages(params: { // Report whether a provider config declares "image" input for a model id.
+  providerConfig: ProviderConfigLike | null; // provider entry to search; null yields null
+  modelId: string; // model id to look up (trimmed before comparison)
+}): boolean | null { // null means "unknown": no model list, blank id, or no matching entry
+  const models = params.providerConfig?.models;
+  if (!Array.isArray(models) || models.length === 0) return null;
+  const trimmedId = params.modelId.trim();
+  if (!trimmedId) return null;
+  const match =
+    models.find((model) => String(model?.id ?? "").trim() === trimmedId) ?? // exact (trimmed) id match is preferred
+    models.find( // otherwise fall back to a case-insensitive match
+      (model) =>
+        String(model?.id ?? "")
+          .trim()
+          .toLowerCase() === trimmedId.toLowerCase(),
+    );
+  if (!match) return null;
+  const input = Array.isArray(match.input) ? match.input : []; // a missing or non-array `input` counts as no inputs
+  return input.includes("image");
+}
+
+function resolvePrimaryModelSupportsImages(params: { // Determine whether the configured primary model declares image input.
+  cfg?: ClawdbotConfig; // config to inspect; without it the answer is null
+  agentDir: string; // directory expected to contain models.json
+}): boolean | null { // null when support cannot be determined from either source
+  if (!params.cfg) return null;
+  const primary = resolveDefaultModelRef(params.cfg);
+  const providerConfig = resolveProviderConfig(
+    params.cfg.models?.providers as Record<string, ProviderConfigLike> | undefined,
+    primary.provider,
+  );
+  const fromConfig = resolveModelSupportsImages({
+    providerConfig,
+    modelId: primary.model,
+  });
+  if (fromConfig !== null) return fromConfig; // a definite answer from cfg takes precedence over models.json
+  try {
+    const modelsPath = path.join(params.agentDir, "models.json");
+    const raw = fsSync.readFileSync(modelsPath, "utf8"); // synchronous read
+    const parsed = JSON.parse(raw) as { providers?: Record<string, ProviderConfigLike> }; // shape is asserted, not validated
+    const provider = resolveProviderConfig(parsed.providers, primary.provider);
+    return resolveModelSupportsImages({
+      providerConfig: provider,
+      modelId: primary.model,
+    });
+  } catch {
+    return null; // any error thrown above (e.g. from the read or JSON.parse) is treated as "unknown"
+  }
+}
+
 /**
 * Resolve the effective image model config for the `image` tool.
 *
@@ -70,6 +145,11 @@ export function resolveImageModelConfigForTool(params: {
  cfg?: ClawdbotConfig;
  agentDir: string;
 }): ImageModelConfig | null {
+  const primarySupportsImages = resolvePrimaryModelSupportsImages({
+    cfg: params.cfg,
+    agentDir: params.agentDir,
+  });
+  if (primarySupportsImages === true) return null;
  const explicit = coerceImageModelConfig(params.cfg);
  if (explicit.primary?.trim() || (explicit.fallbacks?.length ?? 0) > 0) {
    return explicit;
--- a/src/auto-reply/reply.directive.directive-behavior.supports-fuzzy-model-matches-model-directive.test.ts
+++ b/src/auto-reply/reply.directive.directive-behavior.supports-fuzzy-model-matches-model-directive.test.ts
@@ -107,7 +107,12 @@ describe("directive behavior", () => {
      const storePath = path.join(home, "sessions.json");

      await getReplyFromConfig(
-        { Body: "/model kimi-k2-0905-preview", From: "+1222", To: "+1222", CommandAuthorized: true },
+        {
+          Body: "/model kimi-k2-0905-preview",
+          From: "+1222",
+          To: "+1222",
+          CommandAuthorized: true,
+        },
        {},
        {
          agents: {
--- a/src/gateway/ws-log.ts
+++ b/src/gateway/ws-log.ts
@@ -67,7 +67,9 @@ export function formatForLog(value: unknown): string {
        : JSON.stringify(value);
    if (!str) return "";
    const redacted = redactSensitiveText(str, WS_LOG_REDACT_OPTIONS);
-    return redacted.length > LOG_VALUE_LIMIT ? `${redacted.slice(0, LOG_VALUE_LIMIT)}...` : redacted;
+    return redacted.length > LOG_VALUE_LIMIT
+      ? `${redacted.slice(0, LOG_VALUE_LIMIT)}...`
+      : redacted;
  } catch {
    return String(value);
  }
--- a/src/media-understanding/providers/deepgram/audio.live.test.ts
+++ b/src/media-understanding/providers/deepgram/audio.live.test.ts
@@ -31,21 +31,17 @@ async function fetchSampleBuffer(url: string, timeoutMs: number): Promise<Buffer
 }

 describeLive("deepgram live", () => {
-  it(
-    "transcribes sample audio",
-    async () => {
-      const buffer = await fetchSampleBuffer(SAMPLE_URL, 15000);
-      const result = await transcribeDeepgramAudio({
-        buffer,
-        fileName: "sample.wav",
-        mime: "audio/wav",
-        apiKey: DEEPGRAM_KEY,
-        model: DEEPGRAM_MODEL,
-        baseUrl: DEEPGRAM_BASE_URL,
-        timeoutMs: 20000,
-      });
-      expect(result.text.trim().length).toBeGreaterThan(0);
-    },
-    30000,
-  );
+  it("transcribes sample audio", async () => {
+    const buffer = await fetchSampleBuffer(SAMPLE_URL, 15000);
+    const result = await transcribeDeepgramAudio({
+      buffer,
+      fileName: "sample.wav",
+      mime: "audio/wav",
+      apiKey: DEEPGRAM_KEY,
+      model: DEEPGRAM_MODEL,
+      baseUrl: DEEPGRAM_BASE_URL,
+      timeoutMs: 20000,
+    });
+    expect(result.text.trim().length).toBeGreaterThan(0);
+  }, 30000);
 });
--- a/src/media-understanding/providers/deepgram/audio.test.ts
+++ b/src/media-understanding/providers/deepgram/audio.test.ts
@@ -84,6 +84,6 @@ describe("transcribeDeepgramAudio", () => {
    expect(headers.get("authorization")).toBe("Token test-key");
    expect(headers.get("x-custom")).toBe("1");
    expect(headers.get("content-type")).toBe("audio/wav");
-    expect(Buffer.isBuffer(seenInit?.body)).toBe(true);
+    expect(seenInit?.body).toBeInstanceOf(Uint8Array);
  });
 });
--- a/src/media-understanding/runner.ts
+++ b/src/media-understanding/runner.ts
@@ -293,9 +293,9 @@ async function runProviderEntry(params: {
    const providerConfig = cfg.models?.providers?.[providerId];
    const baseUrl = entry.baseUrl ?? params.config?.baseUrl ?? providerConfig?.baseUrl;
    const mergedHeaders = {
-      ...(providerConfig?.headers ?? {}),
-      ...(params.config?.headers ?? {}),
-      ...(entry.headers ?? {}),
+      ...providerConfig?.headers,
+      ...params.config?.headers,
+      ...entry.headers,
    };
    const headers = Object.keys(mergedHeaders).length > 0 ? mergedHeaders : undefined;
    const providerQuery = resolveProviderQuery({
--- a/src/web/auto-reply/monitor/process-message.ts
+++ b/src/web/auto-reply/monitor/process-message.ts
@@ -16,7 +16,10 @@ import {
 import { dispatchReplyWithBufferedBlockDispatcher } from "../../../auto-reply/reply/provider-dispatcher.js";
 import type { getReplyFromConfig } from "../../../auto-reply/reply.js";
 import type { ReplyPayload } from "../../../auto-reply/types.js";
-import { hasInlineCommandTokens, isControlCommandMessage } from "../../../auto-reply/command-detection.js";
+import {
+  hasInlineCommandTokens,
+  isControlCommandMessage,
+} from "../../../auto-reply/command-detection.js";
 import { finalizeInboundContext } from "../../../auto-reply/reply/inbound-context.js";
 import { toLocationContext } from "../../../channels/location.js";
 import type { loadConfig } from "../../../config/config.js";