From 93bef830ce091187e6d047c298e8b9245dcae724 Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Fri, 23 Jan 2026 05:47:13 +0000
Subject: [PATCH] test: add media auto-detect coverage

---
 .../providers/openai/audio.test.ts        |   4 +-
 .../runner.auto-audio.test.ts             |   8 +-
 test/media-understanding.auto.e2e.test.ts | 169 ++++++++++++++++++
 3 files changed, 178 insertions(+), 3 deletions(-)
 create mode 100644 test/media-understanding.auto.e2e.test.ts

diff --git a/src/media-understanding/providers/openai/audio.test.ts b/src/media-understanding/providers/openai/audio.test.ts
index 88c713f2a..323c394ae 100644
--- a/src/media-understanding/providers/openai/audio.test.ts
+++ b/src/media-understanding/providers/openai/audio.test.ts
@@ -59,7 +59,7 @@ describe("transcribeOpenAiCompatibleAudio", () => {
       fetchFn,
     });
 
-    expect(result.model).toBe("whisper-1");
+    expect(result.model).toBe("gpt-4o-mini-transcribe");
     expect(result.text).toBe("hello");
     expect(seenUrl).toBe("https://api.example.com/v1/audio/transcriptions");
     expect(seenInit?.method).toBe("POST");
@@ -71,7 +71,7 @@
 
     const form = seenInit?.body as FormData;
     expect(form).toBeInstanceOf(FormData);
-    expect(form.get("model")).toBe("whisper-1");
+    expect(form.get("model")).toBe("gpt-4o-mini-transcribe");
     expect(form.get("language")).toBe("en");
     expect(form.get("prompt")).toBe("hello");
     const file = form.get("file") as Blob | { type?: string; name?: string } | null;
diff --git a/src/media-understanding/runner.auto-audio.test.ts b/src/media-understanding/runner.auto-audio.test.ts
index 68d9b665e..4e226f6ec 100644
--- a/src/media-understanding/runner.auto-audio.test.ts
+++ b/src/media-understanding/runner.auto-audio.test.ts
@@ -15,6 +15,8 @@ import {
 
 describe("runCapability auto audio entries", () => {
   it("uses provider keys to auto-enable audio transcription", async () => {
+    const originalPath = process.env.PATH;
+    process.env.PATH = "/usr/bin:/bin";
     const tmpPath = path.join(os.tmpdir(), `clawdbot-auto-audio-${Date.now()}.wav`);
     await fs.writeFile(tmpPath, Buffer.from("RIFF"));
     const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" };
@@ -54,15 +56,18 @@
         providerRegistry,
       });
       expect(result.outputs[0]?.text).toBe("ok");
-      expect(seenModel).toBe("whisper-1");
+      expect(seenModel).toBe("gpt-4o-mini-transcribe");
       expect(result.decision.outcome).toBe("success");
     } finally {
+      process.env.PATH = originalPath;
       await cache.cleanup();
       await fs.unlink(tmpPath).catch(() => {});
     }
   });
 
   it("skips auto audio when disabled", async () => {
+    const originalPath = process.env.PATH;
+    process.env.PATH = "/usr/bin:/bin";
     const tmpPath = path.join(os.tmpdir(), `clawdbot-auto-audio-${Date.now()}.wav`);
     await fs.writeFile(tmpPath, Buffer.from("RIFF"));
     const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" };
@@ -107,6 +112,7 @@
       expect(result.outputs).toHaveLength(0);
       expect(result.decision.outcome).toBe("disabled");
     } finally {
+      process.env.PATH = originalPath;
       await cache.cleanup();
       await fs.unlink(tmpPath).catch(() => {});
     }
diff --git a/test/media-understanding.auto.e2e.test.ts b/test/media-understanding.auto.e2e.test.ts
new file mode 100644
index 000000000..cdc9a0c8e
--- /dev/null
+++ b/test/media-understanding.auto.e2e.test.ts
@@ -0,0 +1,169 @@
+import fs from "node:fs/promises";
+import os from "node:os";
+import path from "node:path";
+
+import { afterEach, describe, expect, it, vi } from "vitest";
+
+import type { ClawdbotConfig } from "../src/config/config.js";
+import type { MsgContext } from "../src/auto-reply/templating.js";
+
+const makeTempDir = async (prefix: string) => await fs.mkdtemp(path.join(os.tmpdir(), prefix));
+
+const writeExecutable = async (dir: string, name: string, content: string) => {
+  const filePath = path.join(dir, name);
+  await fs.writeFile(filePath, content, { mode: 0o755 });
+  return filePath;
+};
+
+const makeTempMedia = async (ext: string) => {
+  const dir = await makeTempDir("clawdbot-media-e2e-");
+  const filePath = path.join(dir, `sample${ext}`);
+  await fs.writeFile(filePath, "audio");
+  return { dir, filePath };
+};
+
+const loadApply = async () => {
+  vi.resetModules();
+  return await import("../src/media-understanding/apply.js");
+};
+
+const envSnapshot = () => ({
+  PATH: process.env.PATH,
+  SHERPA_ONNX_MODEL_DIR: process.env.SHERPA_ONNX_MODEL_DIR,
+  WHISPER_CPP_MODEL: process.env.WHISPER_CPP_MODEL,
+});
+
+const restoreEnv = (snapshot: ReturnType<typeof envSnapshot>) => {
+  process.env.PATH = snapshot.PATH;
+  process.env.SHERPA_ONNX_MODEL_DIR = snapshot.SHERPA_ONNX_MODEL_DIR;
+  process.env.WHISPER_CPP_MODEL = snapshot.WHISPER_CPP_MODEL;
+};
+
+describe("media understanding auto-detect (e2e)", () => {
+  let tempPaths: string[] = [];
+
+  afterEach(async () => {
+    for (const p of tempPaths) {
+      await fs.rm(p, { recursive: true, force: true }).catch(() => {});
+    }
+    tempPaths = [];
+  });
+
+  it("uses sherpa-onnx-offline when available", async () => {
+    const snapshot = envSnapshot();
+    try {
+      const binDir = await makeTempDir("clawdbot-bin-sherpa-");
+      const modelDir = await makeTempDir("clawdbot-sherpa-model-");
+      tempPaths.push(binDir, modelDir);
+
+      await fs.writeFile(path.join(modelDir, "tokens.txt"), "a");
+      await fs.writeFile(path.join(modelDir, "encoder.onnx"), "a");
+      await fs.writeFile(path.join(modelDir, "decoder.onnx"), "a");
+      await fs.writeFile(path.join(modelDir, "joiner.onnx"), "a");
+
+      await writeExecutable(
+        binDir,
+        "sherpa-onnx-offline",
+        "#!/usr/bin/env bash\n" + 'echo "{\\"text\\":\\"sherpa ok\\"}"\n',
+      );
+
+      process.env.PATH = `${binDir}:/usr/bin:/bin`;
+      process.env.SHERPA_ONNX_MODEL_DIR = modelDir;
+
+      const { filePath } = await makeTempMedia(".wav");
+      tempPaths.push(path.dirname(filePath));
+
+      const { applyMediaUnderstanding } = await loadApply();
+      const ctx: MsgContext = {
+        Body: "",
+        MediaPath: filePath,
+        MediaType: "audio/wav",
+      };
+      const cfg: ClawdbotConfig = { tools: { media: { audio: {} } } };
+
+      await applyMediaUnderstanding({ ctx, cfg });
+
+      expect(ctx.Transcript).toBe("sherpa ok");
+    } finally {
+      restoreEnv(snapshot);
+    }
+  });
+
+  it("uses whisper-cli when sherpa is missing", async () => {
+    const snapshot = envSnapshot();
+    try {
+      const binDir = await makeTempDir("clawdbot-bin-whispercpp-");
+      const modelDir = await makeTempDir("clawdbot-whispercpp-model-");
+      tempPaths.push(binDir, modelDir);
+
+      const modelPath = path.join(modelDir, "tiny.bin");
+      await fs.writeFile(modelPath, "model");
+
+      await writeExecutable(
+        binDir,
+        "whisper-cli",
+        "#!/usr/bin/env bash\n" +
+          'out=""\n' +
+          'prev=""\n' +
+          'for arg in "$@"; do\n' +
+          '  if [ "$prev" = "-of" ]; then out="$arg"; break; fi\n' +
+          '  prev="$arg"\n' +
+          "done\n" +
+          'if [ -n "$out" ]; then echo \'whisper cpp ok\' > "${out}.txt"; fi\n',
+      );
+
+      process.env.PATH = `${binDir}:/usr/bin:/bin`;
+      process.env.WHISPER_CPP_MODEL = modelPath;
+
+      const { filePath } = await makeTempMedia(".wav");
+      tempPaths.push(path.dirname(filePath));
+
+      const { applyMediaUnderstanding } = await loadApply();
+      const ctx: MsgContext = {
+        Body: "",
+        MediaPath: filePath,
+        MediaType: "audio/wav",
+      };
+      const cfg: ClawdbotConfig = { tools: { media: { audio: {} } } };
+
+      await applyMediaUnderstanding({ ctx, cfg });
+
+      expect(ctx.Transcript).toBe("whisper cpp ok");
+    } finally {
+      restoreEnv(snapshot);
+    }
+  });
+
+  it("uses gemini CLI for images when available", async () => {
+    const snapshot = envSnapshot();
+    try {
+      const binDir = await makeTempDir("clawdbot-bin-gemini-");
+      tempPaths.push(binDir);
+
+      await writeExecutable(
+        binDir,
+        "gemini",
+        "#!/usr/bin/env bash\n" + 'echo "{\\"response\\":\\"gemini ok\\"}"\n',
+      );
+
+      process.env.PATH = `${binDir}:/usr/bin:/bin`;
+
+      const { filePath } = await makeTempMedia(".png");
+      tempPaths.push(path.dirname(filePath));
+
+      const { applyMediaUnderstanding } = await loadApply();
+      const ctx: MsgContext = {
+        Body: "",
+        MediaPath: filePath,
+        MediaType: "image/png",
+      };
+      const cfg: ClawdbotConfig = { tools: { media: { image: {} } } };
+
+      await applyMediaUnderstanding({ ctx, cfg });
+
+      expect(ctx.Body).toContain("gemini ok");
+    } finally {
+      restoreEnv(snapshot);
+    }
+  });
+});