From 93bef830ce091187e6d047c298e8b9245dcae724 Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Fri, 23 Jan 2026 05:47:13 +0000
Subject: [PATCH] test: add media auto-detect coverage

---
 .../providers/openai/audio.test.ts        |   4 +-
 .../runner.auto-audio.test.ts             |   8 +-
 test/media-understanding.auto.e2e.test.ts | 169 ++++++++++++++++++
 3 files changed, 178 insertions(+), 3 deletions(-)
 create mode 100644 test/media-understanding.auto.e2e.test.ts

diff --git a/src/media-understanding/providers/openai/audio.test.ts b/src/media-understanding/providers/openai/audio.test.ts
index 88c713f2a..323c394ae 100644
--- a/src/media-understanding/providers/openai/audio.test.ts
+++ b/src/media-understanding/providers/openai/audio.test.ts
@@ -59,7 +59,7 @@ describe("transcribeOpenAiCompatibleAudio", () => {
       fetchFn,
     });
 
-    expect(result.model).toBe("whisper-1");
+    expect(result.model).toBe("gpt-4o-mini-transcribe");
     expect(result.text).toBe("hello");
     expect(seenUrl).toBe("https://api.example.com/v1/audio/transcriptions");
     expect(seenInit?.method).toBe("POST");
@@ -71,7 +71,7 @@
 
     const form = seenInit?.body as FormData;
     expect(form).toBeInstanceOf(FormData);
-    expect(form.get("model")).toBe("whisper-1");
+    expect(form.get("model")).toBe("gpt-4o-mini-transcribe");
     expect(form.get("language")).toBe("en");
     expect(form.get("prompt")).toBe("hello");
     const file = form.get("file") as Blob | { type?: string; name?: string } | null;
diff --git a/src/media-understanding/runner.auto-audio.test.ts b/src/media-understanding/runner.auto-audio.test.ts
index 68d9b665e..4e226f6ec 100644
--- a/src/media-understanding/runner.auto-audio.test.ts
+++ b/src/media-understanding/runner.auto-audio.test.ts
@@ -15,6 +15,8 @@ import {
 
 describe("runCapability auto audio entries", () => {
   it("uses provider keys to auto-enable audio transcription", async () => {
+    const originalPath = process.env.PATH;
+    process.env.PATH = "/usr/bin:/bin";
     const tmpPath = path.join(os.tmpdir(), `clawdbot-auto-audio-${Date.now()}.wav`);
     await fs.writeFile(tmpPath, Buffer.from("RIFF"));
     const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" };
@@ -54,15 +56,18 @@
         providerRegistry,
       });
       expect(result.outputs[0]?.text).toBe("ok");
-      expect(seenModel).toBe("whisper-1");
+      expect(seenModel).toBe("gpt-4o-mini-transcribe");
       expect(result.decision.outcome).toBe("success");
     } finally {
+      process.env.PATH = originalPath;
       await cache.cleanup();
       await fs.unlink(tmpPath).catch(() => {});
     }
   });
 
   it("skips auto audio when disabled", async () => {
+    const originalPath = process.env.PATH;
+    process.env.PATH = "/usr/bin:/bin";
     const tmpPath = path.join(os.tmpdir(), `clawdbot-auto-audio-${Date.now()}.wav`);
     await fs.writeFile(tmpPath, Buffer.from("RIFF"));
     const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" };
@@ -107,6 +112,7 @@
       expect(result.outputs).toHaveLength(0);
       expect(result.decision.outcome).toBe("disabled");
     } finally {
+      process.env.PATH = originalPath;
       await cache.cleanup();
       await fs.unlink(tmpPath).catch(() => {});
     }
diff --git a/test/media-understanding.auto.e2e.test.ts b/test/media-understanding.auto.e2e.test.ts
new file mode 100644
index 000000000..cdc9a0c8e
--- /dev/null
+++ b/test/media-understanding.auto.e2e.test.ts
@@ -0,0 +1,169 @@
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+
+import { afterEach, describe, expect, it, vi } from "vitest";
+
+import type { ClawdbotConfig } from "../src/config/config.js";
+import type { MsgContext } from "../src/auto-reply/templating.js";
+
+const makeTempDir = async (prefix: string) => await fs.mkdtemp(path.join(os.tmpdir(), prefix));
+
+const writeExecutable = async (dir: string, name: string, content: string) => {
+  const filePath = path.join(dir, name);
+  await fs.writeFile(filePath, content, { mode: 0o755 });
+  return filePath;
+};
+
+const makeTempMedia = async (ext: string) => {
+  const dir = await makeTempDir("clawdbot-media-e2e-");
+  const filePath = path.join(dir, `sample${ext}`);
+  await fs.writeFile(filePath, "audio");
+  return { dir, filePath };
+};
+
+const loadApply = async () => {
+  vi.resetModules();
+  return await import("../src/media-understanding/apply.js");
+};
+
+const envSnapshot = () => ({
+  PATH: process.env.PATH,
+  SHERPA_ONNX_MODEL_DIR: process.env.SHERPA_ONNX_MODEL_DIR,
+  WHISPER_CPP_MODEL: process.env.WHISPER_CPP_MODEL,
+});
+
+const restoreEnv = (snapshot: ReturnType<typeof envSnapshot>) => {
+  process.env.PATH = snapshot.PATH;
+  process.env.SHERPA_ONNX_MODEL_DIR = snapshot.SHERPA_ONNX_MODEL_DIR;
+  process.env.WHISPER_CPP_MODEL = snapshot.WHISPER_CPP_MODEL;
+};
+
+describe("media understanding auto-detect (e2e)", () => {
+  let tempPaths: string[] = [];
+
+  afterEach(async () => {
+    for (const p of tempPaths) {
+      await fs.rm(p, { recursive: true, force: true }).catch(() => {});
+    }
+    tempPaths = [];
+  });
+
+  it("uses sherpa-onnx-offline when available", async () => {
+    const snapshot = envSnapshot();
+    try {
+      const binDir = await makeTempDir("clawdbot-bin-sherpa-");
+      const modelDir = await makeTempDir("clawdbot-sherpa-model-");
+      tempPaths.push(binDir, modelDir);
+
+      await fs.writeFile(path.join(modelDir, "tokens.txt"), "a");
+      await fs.writeFile(path.join(modelDir, "encoder.onnx"), "a");
+      await fs.writeFile(path.join(modelDir, "decoder.onnx"), "a");
+      await fs.writeFile(path.join(modelDir, "joiner.onnx"), "a");
+
+      await writeExecutable(
+        binDir,
+        "sherpa-onnx-offline",
+        "#!/usr/bin/env bash\n" + 'echo "{\\"text\\":\\"sherpa ok\\"}"\n',
+      );
+
+      process.env.PATH = `${binDir}:/usr/bin:/bin`;
+      process.env.SHERPA_ONNX_MODEL_DIR = modelDir;
+
+      const { filePath } = await makeTempMedia(".wav");
+      tempPaths.push(path.dirname(filePath));
+
+      const { applyMediaUnderstanding } = await loadApply();
+      const ctx: MsgContext = {
+        Body: "",
+        MediaPath: filePath,
+        MediaType: "audio/wav",
+      };
+      const cfg: ClawdbotConfig = { tools: { media: { audio: {} } } };
+
+      await applyMediaUnderstanding({ ctx, cfg });
+
+      expect(ctx.Transcript).toBe("sherpa ok");
+    } finally {
+      restoreEnv(snapshot);
+    }
+  });
+
+  it("uses whisper-cli when sherpa is missing", async () => {
+    const snapshot = envSnapshot();
+    try {
+      const binDir = await makeTempDir("clawdbot-bin-whispercpp-");
+      const modelDir = await makeTempDir("clawdbot-whispercpp-model-");
+      tempPaths.push(binDir, modelDir);
+
+      const modelPath = path.join(modelDir, "tiny.bin");
+      await fs.writeFile(modelPath, "model");
+
+      await writeExecutable(
+        binDir,
+        "whisper-cli",
+        "#!/usr/bin/env bash\n" +
+          'out=""\n' +
+          'prev=""\n' +
+          'for arg in "$@"; do\n' +
+          '  if [ "$prev" = "-of" ]; then out="$arg"; break; fi\n' +
+          '  prev="$arg"\n' +
+          "done\n" +
+          'if [ -n "$out" ]; then echo \'whisper cpp ok\' > "${out}.txt"; fi\n',
+      );
+
+      process.env.PATH = `${binDir}:/usr/bin:/bin`;
+      process.env.WHISPER_CPP_MODEL = modelPath;
+
+      const { filePath } = await makeTempMedia(".wav");
+      tempPaths.push(path.dirname(filePath));
+
+      const { applyMediaUnderstanding } = await loadApply();
+      const ctx: MsgContext = {
+        Body: "",
+        MediaPath: filePath,
+        MediaType: "audio/wav",
+      };
+      const cfg: ClawdbotConfig = { tools: { media: { audio: {} } } };
+
+      await applyMediaUnderstanding({ ctx, cfg });
+
+      expect(ctx.Transcript).toBe("whisper cpp ok");
+    } finally {
+      restoreEnv(snapshot);
+    }
+  });
+
+  it("uses gemini CLI for images when available", async () => {
+    const snapshot = envSnapshot();
+    try {
+      const binDir = await makeTempDir("clawdbot-bin-gemini-");
+      tempPaths.push(binDir);
+
+      await writeExecutable(
+        binDir,
+        "gemini",
+        "#!/usr/bin/env bash\n" + 'echo "{\\"response\\":\\"gemini ok\\"}"\n',
+      );
+
+      process.env.PATH = `${binDir}:/usr/bin:/bin`;
+
+      const { filePath } = await makeTempMedia(".png");
+      tempPaths.push(path.dirname(filePath));
+
+      const { applyMediaUnderstanding } = await loadApply();
+      const ctx: MsgContext = {
+        Body: "",
+        MediaPath: filePath,
+        MediaType: "image/png",
+      };
+      const cfg: ClawdbotConfig = { tools: { media: { image: {} } } };
+
+      await applyMediaUnderstanding({ ctx, cfg });
+
+      expect(ctx.Body).toContain("gemini ok");
+    } finally {
+      restoreEnv(snapshot);
+    }
+  });
+});