test: add media auto-detect coverage

Author: Peter Steinberger
Date:   2026-01-23 05:47:13 +00:00
Parent: 2dfbd1c1f6
Commit: 93bef830ce

3 changed files with 178 additions and 3 deletions


@@ -59,7 +59,7 @@ describe("transcribeOpenAiCompatibleAudio", () => {
       fetchFn,
     });
-    expect(result.model).toBe("whisper-1");
+    expect(result.model).toBe("gpt-4o-mini-transcribe");
     expect(result.text).toBe("hello");
     expect(seenUrl).toBe("https://api.example.com/v1/audio/transcriptions");
     expect(seenInit?.method).toBe("POST");
@@ -71,7 +71,7 @@ describe("transcribeOpenAiCompatibleAudio", () => {
     const form = seenInit?.body as FormData;
     expect(form).toBeInstanceOf(FormData);
-    expect(form.get("model")).toBe("whisper-1");
+    expect(form.get("model")).toBe("gpt-4o-mini-transcribe");
     expect(form.get("language")).toBe("en");
     expect(form.get("prompt")).toBe("hello");
     const file = form.get("file") as Blob | { type?: string; name?: string } | null;


@@ -15,6 +15,8 @@ import {
 
 describe("runCapability auto audio entries", () => {
   it("uses provider keys to auto-enable audio transcription", async () => {
+    const originalPath = process.env.PATH;
+    process.env.PATH = "/usr/bin:/bin";
     const tmpPath = path.join(os.tmpdir(), `clawdbot-auto-audio-${Date.now()}.wav`);
     await fs.writeFile(tmpPath, Buffer.from("RIFF"));
     const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" };
@@ -54,15 +56,18 @@ describe("runCapability auto audio entries", () => {
         providerRegistry,
       });
       expect(result.outputs[0]?.text).toBe("ok");
-      expect(seenModel).toBe("whisper-1");
+      expect(seenModel).toBe("gpt-4o-mini-transcribe");
       expect(result.decision.outcome).toBe("success");
     } finally {
+      process.env.PATH = originalPath;
       await cache.cleanup();
       await fs.unlink(tmpPath).catch(() => {});
     }
   });
 
   it("skips auto audio when disabled", async () => {
+    const originalPath = process.env.PATH;
+    process.env.PATH = "/usr/bin:/bin";
     const tmpPath = path.join(os.tmpdir(), `clawdbot-auto-audio-${Date.now()}.wav`);
     await fs.writeFile(tmpPath, Buffer.from("RIFF"));
     const ctx: MsgContext = { MediaPath: tmpPath, MediaType: "audio/wav" };
@@ -107,6 +112,7 @@ describe("runCapability auto audio entries", () => {
       expect(result.outputs).toHaveLength(0);
       expect(result.decision.outcome).toBe("disabled");
     } finally {
+      process.env.PATH = originalPath;
       await cache.cleanup();
       await fs.unlink(tmpPath).catch(() => {});
     }


@@ -0,0 +1,169 @@
import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";
import { afterEach, describe, expect, it, vi } from "vitest";
import type { ClawdbotConfig } from "../src/config/config.js";
import type { MsgContext } from "../src/auto-reply/templating.js";
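
// Test helpers: temp directories, fake executables placed on PATH, and
// snapshot/restore of the env vars the auto-detect path reads.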
const makeTempDir = async (prefix: string) => await fs.mkdtemp(path.join(os.tmpdir(), prefix));

const writeExecutable = async (dir: string, name: string, content: string) => {
  const filePath = path.join(dir, name);
  await fs.writeFile(filePath, content, { mode: 0o755 });
  return filePath;
};

const makeTempMedia = async (ext: string) => {
  const dir = await makeTempDir("clawdbot-media-e2e-");
  const filePath = path.join(dir, `sample${ext}`);
  await fs.writeFile(filePath, "audio");
  return { dir, filePath };
};

const loadApply = async () => {
  vi.resetModules();
  return await import("../src/media-understanding/apply.js");
};

const envSnapshot = () => ({
  PATH: process.env.PATH,
  SHERPA_ONNX_MODEL_DIR: process.env.SHERPA_ONNX_MODEL_DIR,
  WHISPER_CPP_MODEL: process.env.WHISPER_CPP_MODEL,
});

const restoreEnv = (snapshot: ReturnType<typeof envSnapshot>) => {
  process.env.PATH = snapshot.PATH;
  process.env.SHERPA_ONNX_MODEL_DIR = snapshot.SHERPA_ONNX_MODEL_DIR;
  process.env.WHISPER_CPP_MODEL = snapshot.WHISPER_CPP_MODEL;
};

describe("media understanding auto-detect (e2e)", () => {
  let tempPaths: string[] = [];

  afterEach(async () => {
    for (const p of tempPaths) {
      await fs.rm(p, { recursive: true, force: true }).catch(() => {});
    }
    tempPaths = [];
  });

  it("uses sherpa-onnx-offline when available", async () => {
    const snapshot = envSnapshot();
    try {
      const binDir = await makeTempDir("clawdbot-bin-sherpa-");
      const modelDir = await makeTempDir("clawdbot-sherpa-model-");
      tempPaths.push(binDir, modelDir);
      await fs.writeFile(path.join(modelDir, "tokens.txt"), "a");
      await fs.writeFile(path.join(modelDir, "encoder.onnx"), "a");
      await fs.writeFile(path.join(modelDir, "decoder.onnx"), "a");
      await fs.writeFile(path.join(modelDir, "joiner.onnx"), "a");
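      // Fake sherpa-onnx-offline binary: prints a JSON transcript to stdout.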
      await writeExecutable(
        binDir,
        "sherpa-onnx-offline",
        "#!/usr/bin/env bash\n" + 'echo "{\\"text\\":\\"sherpa ok\\"}"\n',
      );
      process.env.PATH = `${binDir}:/usr/bin:/bin`;
      process.env.SHERPA_ONNX_MODEL_DIR = modelDir;
      const { filePath } = await makeTempMedia(".wav");
      tempPaths.push(path.dirname(filePath));
      const { applyMediaUnderstanding } = await loadApply();
      const ctx: MsgContext = {
        Body: "<media:audio>",
        MediaPath: filePath,
        MediaType: "audio/wav",
      };
      const cfg: ClawdbotConfig = { tools: { media: { audio: {} } } };
      await applyMediaUnderstanding({ ctx, cfg });
      expect(ctx.Transcript).toBe("sherpa ok");
    } finally {
      restoreEnv(snapshot);
    }
  });

  it("uses whisper-cli when sherpa is missing", async () => {
    const snapshot = envSnapshot();
    try {
      const binDir = await makeTempDir("clawdbot-bin-whispercpp-");
      const modelDir = await makeTempDir("clawdbot-whispercpp-model-");
      tempPaths.push(binDir, modelDir);
      const modelPath = path.join(modelDir, "tiny.bin");
      await fs.writeFile(modelPath, "model");
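      // Fake whisper-cli: looks for the value passed after -of and writes the
      // transcript to "<out>.txt".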
      await writeExecutable(
        binDir,
        "whisper-cli",
        "#!/usr/bin/env bash\n" +
          'out=""\n' +
          'prev=""\n' +
          'for arg in "$@"; do\n' +
          ' if [ "$prev" = "-of" ]; then out="$arg"; break; fi\n' +
          ' prev="$arg"\n' +
          "done\n" +
          'if [ -n "$out" ]; then echo \'whisper cpp ok\' > "${out}.txt"; fi\n',
      );
      process.env.PATH = `${binDir}:/usr/bin:/bin`;
      process.env.WHISPER_CPP_MODEL = modelPath;
      const { filePath } = await makeTempMedia(".wav");
      tempPaths.push(path.dirname(filePath));
      const { applyMediaUnderstanding } = await loadApply();
      const ctx: MsgContext = {
        Body: "<media:audio>",
        MediaPath: filePath,
        MediaType: "audio/wav",
      };
      const cfg: ClawdbotConfig = { tools: { media: { audio: {} } } };
      await applyMediaUnderstanding({ ctx, cfg });
      expect(ctx.Transcript).toBe("whisper cpp ok");
    } finally {
      restoreEnv(snapshot);
    }
  });

  it("uses gemini CLI for images when available", async () => {
    const snapshot = envSnapshot();
    try {
      const binDir = await makeTempDir("clawdbot-bin-gemini-");
      tempPaths.push(binDir);
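      // Fake gemini CLI: echoes a JSON payload containing a "response" field.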
      await writeExecutable(
        binDir,
        "gemini",
        "#!/usr/bin/env bash\necho '{" + '\\"response\\":\\"gemini ok\\"' + "}'\n",
      );
      process.env.PATH = `${binDir}:/usr/bin:/bin`;
      const { filePath } = await makeTempMedia(".png");
      tempPaths.push(path.dirname(filePath));
      const { applyMediaUnderstanding } = await loadApply();
      const ctx: MsgContext = {
        Body: "<media:image>",
        MediaPath: filePath,
        MediaType: "image/png",
      };
      const cfg: ClawdbotConfig = { tools: { media: { image: {} } } };
      await applyMediaUnderstanding({ ctx, cfg });
      expect(ctx.Body).toContain("gemini ok");
    } finally {
      restoreEnv(snapshot);
    }
  });
});