fix: modernize live tests and gemini ids
This commit is contained in:
@@ -11,9 +11,15 @@ import {
|
||||
} from "@mariozechner/pi-coding-agent";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { resolveClawdbotAgentDir } from "../agents/agent-paths.js";
|
||||
import {
|
||||
collectAnthropicApiKeys,
|
||||
isAnthropicRateLimitError,
|
||||
} from "../agents/live-auth-keys.js";
|
||||
import { isModernModelRef } from "../agents/live-model-filter.js";
|
||||
import { getApiKeyForModel } from "../agents/model-auth.js";
|
||||
import { ensureClawdbotModelsJson } from "../agents/models-config.js";
|
||||
import { loadConfig } from "../config/config.js";
|
||||
import type { ClawdbotConfig, ModelProviderConfig } from "../config/types.js";
|
||||
import {
|
||||
GATEWAY_CLIENT_MODES,
|
||||
GATEWAY_CLIENT_NAMES,
|
||||
@@ -25,16 +31,14 @@ import { startGatewayServer } from "./server.js";
|
||||
|
||||
const LIVE = process.env.LIVE === "1" || process.env.CLAWDBOT_LIVE_TEST === "1";
|
||||
const GATEWAY_LIVE = process.env.CLAWDBOT_LIVE_GATEWAY === "1";
|
||||
const ALL_MODELS =
|
||||
process.env.CLAWDBOT_LIVE_GATEWAY_ALL_MODELS === "1" ||
|
||||
process.env.CLAWDBOT_LIVE_GATEWAY_MODELS === "all";
|
||||
const EXTRA_TOOL_PROBES = process.env.CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE === "1";
|
||||
const EXTRA_IMAGE_PROBES =
|
||||
process.env.CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE === "1";
|
||||
const ZAI_FALLBACK = process.env.CLAWDBOT_LIVE_GATEWAY_ZAI_FALLBACK === "1";
|
||||
const PROVIDERS = parseFilter(process.env.CLAWDBOT_LIVE_GATEWAY_PROVIDERS);
|
||||
const THINKING_LEVEL = "high";
|
||||
const THINKING_TAG_RE =
|
||||
/<\s*\/?\s*(?:think(?:ing)?|thought|antthinking)\s*>/i;
|
||||
const FINAL_TAG_RE = /<\s*\/?\s*final\s*>/i;
|
||||
|
||||
const describeLive = LIVE && GATEWAY_LIVE ? describe : describe.skip;
|
||||
const describeLive = LIVE || GATEWAY_LIVE ? describe : describe.skip;
|
||||
|
||||
function parseFilter(raw?: string): Set<string> | null {
|
||||
const trimmed = raw?.trim();
|
||||
@@ -46,6 +50,26 @@ function parseFilter(raw?: string): Set<string> | null {
|
||||
return ids.length ? new Set(ids) : null;
|
||||
}
|
||||
|
||||
/**
 * Writes one progress line for the live suite to stdout.
 *
 * @param message Text to print; it is emitted after a fixed `[live] ` prefix.
 */
function logProgress(message: string): void {
  const line = "[live] " + message;
  console.log(line);
}
|
||||
|
||||
/**
 * Fails when a model reply still contains reasoning or final wrapper tags.
 *
 * Empty text is accepted as-is. Otherwise the text is checked against
 * `THINKING_TAG_RE` and `FINAL_TAG_RE`; on a match an error is thrown whose
 * message carries the suite label, model, phase and at most the first 200
 * characters of the offending text.
 *
 * @param params.text  Reply text to inspect.
 * @param params.model Model key included in the error message.
 * @param params.phase Probe phase included in the error message.
 * @param params.label Suite label included in the error message.
 * @throws Error when either tag pattern matches `params.text`.
 */
function assertNoReasoningTags(params: {
  text: string;
  model: string;
  phase: string;
  label: string;
}): void {
  const { text, model, phase, label } = params;
  if (!text) return;

  const leaked = [THINKING_TAG_RE, FINAL_TAG_RE].some((pattern) =>
    pattern.test(text),
  );
  if (!leaked) return;

  // Keep the error readable: long replies are cut to 200 characters.
  let snippet = text;
  if (text.length > 200) {
    snippet = `${text.slice(0, 200)}…`;
  }
  throw new Error(
    `[${label}] reasoning tag leak (${model} / ${phase}): ${snippet}`,
  );
}
|
||||
|
||||
function extractPayloadText(result: unknown): string {
|
||||
const record = result as Record<string, unknown>;
|
||||
const payloads = Array.isArray(record.payloads) ? record.payloads : [];
|
||||
@@ -200,61 +224,470 @@ async function connectClient(params: { url: string; token: string }) {
|
||||
});
|
||||
}
|
||||
|
||||
/** Inputs for one pass of `runGatewayModelSuite`. */
type GatewayModelSuiteParams = {
  /** Suite name; used in progress log prefixes and in the session key. */
  label: string;
  /** Base configuration from which the temporary live config is derived. */
  cfg: ClawdbotConfig;
  /** Models to exercise, in order. */
  candidates: Model<Api>[];
  /** When true, the exec+read tool probe runs in addition to the read probe. */
  extraToolProbes: boolean;
  /** When true, the image probe runs for models whose `input` includes "image". */
  extraImageProbes: boolean;
  /** Value sent as `thinking` with every agent request. */
  thinkingLevel: string;
  /** Provider entries merged last into the temporary config's providers. */
  providerOverrides?: Record<string, ModelProviderConfig>;
};
|
||||
|
||||
/**
 * Derives the configuration used by a live gateway run from the user's config.
 *
 * The result is a shallow copy of `params.cfg` in which:
 * - every agent list entry and the agent defaults have `sandbox.mode` set to "off";
 * - the default model allowlist is replaced by exactly the candidate models,
 *   keyed as `provider/id`;
 * - an existing `lmstudio` provider is forced to the "openai-completions" API;
 * - `params.providerOverrides` entries are merged last and win over both.
 *
 * When no provider ends up configured, `models` is passed through untouched.
 * The input config is not mutated.
 */
function buildLiveGatewayConfig(params: {
  cfg: ClawdbotConfig;
  candidates: Array<Model<Api>>;
  providerOverrides?: Record<string, ModelProviderConfig>;
}): ClawdbotConfig {
  const { cfg, candidates, providerOverrides } = params;

  const lmstudio = cfg.models?.providers?.lmstudio;
  const lmstudioPatch = lmstudio
    ? { lmstudio: { ...lmstudio, api: "openai-completions" } }
    : {};
  const mergedProviders = {
    ...(cfg.models?.providers ?? {}),
    ...lmstudioPatch,
    ...(providerOverrides ?? {}),
  };
  const hasProviders = Object.keys(mergedProviders).length > 0;

  // Allow exactly the selected models so per-session model overrides stick.
  const modelAllowlist = Object.fromEntries(
    candidates.map((candidate) => [
      `${candidate.provider}/${candidate.id}`,
      {},
    ]),
  );

  return {
    ...cfg,
    agents: {
      ...cfg.agents,
      list: (cfg.agents?.list ?? []).map((agent) => ({
        ...agent,
        sandbox: { mode: "off" },
      })),
      defaults: {
        ...cfg.agents?.defaults,
        // Docker sandboxing is disabled for live runs so the tool probes can
        // reach the temporary probe files created in the host workspace.
        sandbox: { mode: "off" },
        models: modelAllowlist,
      },
    },
    models: hasProviders
      ? { ...cfg.models, providers: mergedProviders }
      : cfg.models,
  };
}
|
||||
|
||||
/**
 * Builds a copy of the configured `minimax` provider that targets a different
 * API flavor and base URL.
 *
 * @param params.cfg     Config whose `models.providers.minimax` entry is reused.
 * @param params.api     API flavor to set on the copy.
 * @param params.baseUrl Base URL to set on the copy.
 * @returns The adjusted provider entry, or `null` when no minimax provider is
 *          configured or its `models` is missing, not an array, or empty.
 */
function buildMinimaxProviderOverride(params: {
  cfg: ClawdbotConfig;
  api: "openai-completions" | "anthropic-messages";
  baseUrl: string;
}): ModelProviderConfig | null {
  const { cfg, api, baseUrl } = params;
  const minimax = cfg.models?.providers?.minimax;

  if (minimax && Array.isArray(minimax.models) && minimax.models.length > 0) {
    return { ...minimax, api, baseUrl };
  }
  return null;
}
|
||||
|
||||
/**
 * Restores one environment variable to a previously captured value.
 *
 * Assigning `undefined` to a `process.env` property stores the literal string
 * "undefined", so a variable that was originally unset has to be deleted
 * instead of assigned.
 */
function restoreLiveEnvVar(name: string, previous: string | undefined): void {
  if (previous === undefined) {
    delete process.env[name];
    return;
  }
  process.env[name] = previous;
}

/**
 * Runs every cleanup step in order, even when an earlier one fails, then
 * rethrows the first failure so a broken teardown is still reported.
 */
async function runLiveCleanups(steps: Array<() => unknown>): Promise<void> {
  const errors: unknown[] = [];
  for (const step of steps) {
    try {
      // eslint-disable-next-line no-await-in-loop
      await step();
    } catch (err) {
      errors.push(err);
    }
  }
  if (errors.length > 0) throw errors[0];
}

/**
 * Starts a loopback gateway with a temporary config and runs the live probe
 * sequence against every candidate model.
 *
 * Per model the session is patched to that model and reset, then the probes
 * run in order: plain prompt, `read` tool probe, optional exec+read probe
 * (`extraToolProbes`), optional image probe (`extraImageProbes`, only for
 * models whose `input` includes "image"), and for OpenAI Responses-style APIs
 * a tool-only turn followed by a user message. Every reply is checked for
 * leaked reasoning tags.
 *
 * Anthropic models are retried with the next collected API key when a rate
 * limit error is detected. Google "model not found" replies and reused
 * OpenAI Codex refresh tokens are logged and skipped instead of failing.
 *
 * Teardown always runs: the client is stopped, the server closed, the temp
 * dir and probe file removed, and every environment variable this function
 * touches (including `ANTHROPIC_API_KEY`) is put back to its prior state.
 *
 * @param params Suite inputs; see `GatewayModelSuiteParams`.
 * @throws Error listing up to 20 failing models when any model fails a probe.
 */
async function runGatewayModelSuite(
  params: GatewayModelSuiteParams,
): Promise<void> {
  // Snapshot everything we are about to overwrite so it can be restored exactly.
  const savedEnv = [
    "CLAWDBOT_CONFIG_PATH",
    "CLAWDBOT_GATEWAY_TOKEN",
    "CLAWDBOT_SKIP_PROVIDERS",
    "CLAWDBOT_SKIP_GMAIL_WATCHER",
    "CLAWDBOT_SKIP_CRON",
    "CLAWDBOT_SKIP_CANVAS_HOST",
    "ANTHROPIC_API_KEY",
  ].map((name): [string, string | undefined] => [name, process.env[name]]);
  // Cleanup steps, most recently acquired resource first.
  const cleanups: Array<() => unknown> = [];

  try {
    process.env.CLAWDBOT_SKIP_PROVIDERS = "1";
    process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = "1";
    process.env.CLAWDBOT_SKIP_CRON = "1";
    process.env.CLAWDBOT_SKIP_CANVAS_HOST = "1";

    const token = `test-${randomUUID()}`;
    process.env.CLAWDBOT_GATEWAY_TOKEN = token;

    const workspaceDir = resolveUserPath(
      params.cfg.agents?.defaults?.workspace ?? path.join(os.homedir(), "clawd"),
    );
    await fs.mkdir(workspaceDir, { recursive: true });
    const nonceA = randomUUID();
    const nonceB = randomUUID();
    const toolProbePath = path.join(
      workspaceDir,
      `.clawdbot-live-tool-probe.${nonceA}.txt`,
    );
    cleanups.unshift(() => fs.rm(toolProbePath, { force: true }));
    await fs.writeFile(toolProbePath, `nonceA=${nonceA}\nnonceB=${nonceB}\n`);

    const nextCfg = buildLiveGatewayConfig({
      cfg: params.cfg,
      candidates: params.candidates,
      providerOverrides: params.providerOverrides,
    });
    const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-live-"));
    cleanups.unshift(() => fs.rm(tempDir, { recursive: true, force: true }));
    const tempConfigPath = path.join(tempDir, "clawdbot.json");
    await fs.writeFile(tempConfigPath, `${JSON.stringify(nextCfg, null, 2)}\n`);
    process.env.CLAWDBOT_CONFIG_PATH = tempConfigPath;

    await ensureClawdbotModelsJson(nextCfg);

    const port = await getFreeGatewayPort();
    const server = await startGatewayServer(port, {
      bind: "loopback",
      auth: { mode: "token", token },
      controlUiEnabled: false,
    });
    cleanups.unshift(() => server.close({ reason: "live test complete" }));

    const client = await connectClient({
      url: `ws://127.0.0.1:${port}`,
      token,
    });
    cleanups.unshift(() => client.stop());

    logProgress(
      `[${params.label}] running ${params.candidates.length} models (thinking=${params.thinkingLevel})`,
    );
    const anthropicKeys = collectAnthropicApiKeys();
    if (anthropicKeys.length > 0) {
      process.env.ANTHROPIC_API_KEY = anthropicKeys[0];
      logProgress(`[${params.label}] anthropic keys loaded: ${anthropicKeys.length}`);
    }
    const sessionKey = `agent:dev:${params.label}`;
    const failures: Array<{ model: string; error: string }> = [];
    const total = params.candidates.length;

    for (const [index, model] of params.candidates.entries()) {
      const modelKey = `${model.provider}/${model.id}`;
      const progressLabel = `[${params.label}] ${index + 1}/${total} ${modelKey}`;

      // Anthropic gets one attempt per available key; everything else runs once.
      const attemptMax =
        model.provider === "anthropic" && anthropicKeys.length > 0
          ? anthropicKeys.length
          : 1;

      for (let attempt = 0; attempt < attemptMax; attempt += 1) {
        if (model.provider === "anthropic" && anthropicKeys.length > 0) {
          process.env.ANTHROPIC_API_KEY = anthropicKeys[attempt];
        }
        try {
          // Ensure session exists + override model for this run.
          await client.request<Record<string, unknown>>("sessions.patch", {
            key: sessionKey,
            model: modelKey,
          });
          // Reset between models: avoids cross-provider transcript incompatibilities
          // (notably OpenAI Responses requiring reasoning replay for function_call items).
          await client.request<Record<string, unknown>>("sessions.reset", {
            key: sessionKey,
          });

          logProgress(`${progressLabel}: prompt`);
          const runId = randomUUID();
          const payload = await client.request<AgentFinalPayload>(
            "agent",
            {
              sessionKey,
              idempotencyKey: `idem-${runId}`,
              message:
                "Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
              thinking: params.thinkingLevel,
              deliver: false,
            },
            { expectFinal: true },
          );

          if (payload?.status !== "ok") {
            throw new Error(`agent status=${String(payload?.status)}`);
          }
          const text = extractPayloadText(payload?.result);
          if (model.provider === "google" && isGoogleModelNotFoundText(text)) {
            // Catalog drift: model IDs can disappear or become unavailable on the API.
            // Treat as skip when scanning "all models" for Google.
            logProgress(`${progressLabel}: skip (google model not found)`);
            break;
          }
          assertNoReasoningTags({
            text,
            model: modelKey,
            phase: "prompt",
            label: params.label,
          });
          if (!isMeaningful(text)) throw new Error(`not meaningful: ${text}`);
          if (
            !/\bmicro\s*-?\s*tasks?\b/i.test(text) ||
            !/\bmacro\s*-?\s*tasks?\b/i.test(text)
          ) {
            throw new Error(`missing required keywords: ${text}`);
          }

          // Real tool invocation: force the agent to Read a local file and echo a nonce.
          logProgress(`${progressLabel}: tool-read`);
          const runIdTool = randomUUID();
          const toolProbe = await client.request<AgentFinalPayload>(
            "agent",
            {
              sessionKey,
              idempotencyKey: `idem-${runIdTool}-tool`,
              message:
                "Clawdbot live tool probe (local, safe): " +
                `use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
                "Then reply with the two nonce values you read (include both).",
              thinking: params.thinkingLevel,
              deliver: false,
            },
            { expectFinal: true },
          );
          if (toolProbe?.status !== "ok") {
            throw new Error(`tool probe failed: status=${String(toolProbe?.status)}`);
          }
          const toolText = extractPayloadText(toolProbe?.result);
          assertNoReasoningTags({
            text: toolText,
            model: modelKey,
            phase: "tool-read",
            label: params.label,
          });
          if (!toolText.includes(nonceA) || !toolText.includes(nonceB)) {
            throw new Error(`tool probe missing nonce: ${toolText}`);
          }

          if (params.extraToolProbes) {
            logProgress(`${progressLabel}: tool-exec`);
            const nonceC = randomUUID();
            const toolWritePath = path.join(tempDir, `write-${runIdTool}.txt`);

            const execReadProbe = await client.request<AgentFinalPayload>(
              "agent",
              {
                sessionKey,
                idempotencyKey: `idem-${runIdTool}-exec-read`,
                message:
                  "Clawdbot live tool probe (local, safe): " +
                  "use the tool named `exec` (or `Exec`) to run this command: " +
                  `mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
                  `Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
                  "Finally reply including the nonce text you read back.",
                thinking: params.thinkingLevel,
                deliver: false,
              },
              { expectFinal: true },
            );
            if (execReadProbe?.status !== "ok") {
              throw new Error(
                `exec+read probe failed: status=${String(execReadProbe?.status)}`,
              );
            }
            const execReadText = extractPayloadText(execReadProbe?.result);
            assertNoReasoningTags({
              text: execReadText,
              model: modelKey,
              phase: "tool-exec",
              label: params.label,
            });
            if (!execReadText.includes(nonceC)) {
              throw new Error(`exec+read probe missing nonce: ${execReadText}`);
            }

            await fs.rm(toolWritePath, { force: true });
          }

          if (params.extraImageProbes && model.input?.includes("image")) {
            logProgress(`${progressLabel}: image`);
            const imageCode = randomImageProbeCode(10);
            const imageBase64 = renderCatNoncePngBase64(imageCode);
            const runIdImage = randomUUID();

            const imageProbe = await client.request<AgentFinalPayload>(
              "agent",
              {
                sessionKey,
                idempotencyKey: `idem-${runIdImage}-image`,
                message:
                  "Look at the attached image. Reply with exactly two tokens separated by a single space: " +
                  "(1) the animal shown or written in the image, lowercase; " +
                  "(2) the code printed in the image, uppercase. No extra text.",
                attachments: [
                  {
                    mimeType: "image/png",
                    fileName: `probe-${runIdImage}.png`,
                    content: imageBase64,
                  },
                ],
                thinking: params.thinkingLevel,
                deliver: false,
              },
              { expectFinal: true },
            );
            if (imageProbe?.status !== "ok") {
              throw new Error(
                `image probe failed: status=${String(imageProbe?.status)}`,
              );
            }
            const imageText = extractPayloadText(imageProbe?.result);
            assertNoReasoningTags({
              text: imageText,
              model: modelKey,
              phase: "image",
              label: params.label,
            });
            if (!/\bcat\b/i.test(imageText)) {
              throw new Error(`image probe missing 'cat': ${imageText}`);
            }
            // Tolerate small OCR slips: accept any token within edit distance 2
            // of the rendered code (and within 2 characters of its length).
            const candidates =
              imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? [];
            const bestDistance = candidates.reduce((best, cand) => {
              if (Math.abs(cand.length - imageCode.length) > 2) return best;
              return Math.min(best, editDistance(cand, imageCode));
            }, Number.POSITIVE_INFINITY);
            if (!(bestDistance <= 2)) {
              throw new Error(
                `image probe missing code (${imageCode}): ${imageText}`,
              );
            }
          }

          // Regression: tool-call-only turn followed by a user message (OpenAI responses bug class).
          if (
            (model.provider === "openai" && model.api === "openai-responses") ||
            (model.provider === "openai-codex" &&
              model.api === "openai-codex-responses")
          ) {
            logProgress(`${progressLabel}: tool-only regression`);
            const runId2 = randomUUID();
            const first = await client.request<AgentFinalPayload>(
              "agent",
              {
                sessionKey,
                idempotencyKey: `idem-${runId2}-1`,
                message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`,
                thinking: params.thinkingLevel,
                deliver: false,
              },
              { expectFinal: true },
            );
            if (first?.status !== "ok") {
              throw new Error(`tool-only turn failed: status=${String(first?.status)}`);
            }
            const firstText = extractPayloadText(first?.result);
            assertNoReasoningTags({
              text: firstText,
              model: modelKey,
              phase: "tool-only",
              label: params.label,
            });

            const second = await client.request<AgentFinalPayload>(
              "agent",
              {
                sessionKey,
                idempotencyKey: `idem-${runId2}-2`,
                message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`,
                thinking: params.thinkingLevel,
                deliver: false,
              },
              { expectFinal: true },
            );
            if (second?.status !== "ok") {
              throw new Error(
                `post-tool message failed: status=${String(second?.status)}`,
              );
            }
            const reply = extractPayloadText(second?.result);
            assertNoReasoningTags({
              text: reply,
              model: modelKey,
              phase: "tool-only-followup",
              label: params.label,
            });
            if (!reply.includes(nonceA) || !reply.includes(nonceB)) {
              throw new Error(`unexpected reply: ${reply}`);
            }
          }

          logProgress(`${progressLabel}: done`);
          break;
        } catch (err) {
          const message = String(err);
          if (
            model.provider === "anthropic" &&
            isAnthropicRateLimitError(message) &&
            attempt + 1 < attemptMax
          ) {
            logProgress(`${progressLabel}: rate limit, retrying with next key`);
            continue;
          }
          // OpenAI Codex refresh tokens can become single-use; skip instead of failing all live tests.
          if (
            model.provider === "openai-codex" &&
            isRefreshTokenReused(message)
          ) {
            logProgress(`${progressLabel}: skip (codex refresh token reused)`);
            break;
          }
          logProgress(`${progressLabel}: failed`);
          failures.push({ model: modelKey, error: message });
          break;
        }
      }
    }

    if (failures.length > 0) {
      const preview = failures
        .slice(0, 20)
        .map((f) => `- ${f.model}: ${f.error}`)
        .join("\n");
      throw new Error(
        `gateway live model failures (${failures.length}):\n${preview}`,
      );
    }
  } finally {
    try {
      // Client first, then server, temp dir and probe file.
      await runLiveCleanups(cleanups);
    } finally {
      // Restore the environment even if a cleanup step failed.
      for (const [name, previous] of savedEnv) {
        restoreLiveEnvVar(name, previous);
      }
    }
  }
}
|
||||
|
||||
describeLive("gateway live (dev agent, profile keys)", () => {
|
||||
it(
|
||||
"runs meaningful prompts across models with available keys",
|
||||
async () => {
|
||||
const previous = {
|
||||
configPath: process.env.CLAWDBOT_CONFIG_PATH,
|
||||
token: process.env.CLAWDBOT_GATEWAY_TOKEN,
|
||||
skipProviders: process.env.CLAWDBOT_SKIP_PROVIDERS,
|
||||
skipGmail: process.env.CLAWDBOT_SKIP_GMAIL_WATCHER,
|
||||
skipCron: process.env.CLAWDBOT_SKIP_CRON,
|
||||
skipCanvas: process.env.CLAWDBOT_SKIP_CANVAS_HOST,
|
||||
};
|
||||
|
||||
process.env.CLAWDBOT_SKIP_PROVIDERS = "1";
|
||||
process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = "1";
|
||||
process.env.CLAWDBOT_SKIP_CRON = "1";
|
||||
process.env.CLAWDBOT_SKIP_CANVAS_HOST = "1";
|
||||
|
||||
const token = `test-${randomUUID()}`;
|
||||
process.env.CLAWDBOT_GATEWAY_TOKEN = token;
|
||||
|
||||
const cfg = loadConfig();
|
||||
await ensureClawdbotModelsJson(cfg);
|
||||
|
||||
const workspaceDir = resolveUserPath(
|
||||
cfg.agents?.defaults?.workspace ?? path.join(os.homedir(), "clawd"),
|
||||
);
|
||||
await fs.mkdir(workspaceDir, { recursive: true });
|
||||
const nonceA = randomUUID();
|
||||
const nonceB = randomUUID();
|
||||
const toolProbePath = path.join(
|
||||
workspaceDir,
|
||||
`.clawdbot-live-tool-probe.${nonceA}.txt`,
|
||||
);
|
||||
await fs.writeFile(toolProbePath, `nonceA=${nonceA}\nnonceB=${nonceB}\n`);
|
||||
|
||||
const agentDir = resolveClawdbotAgentDir();
|
||||
const authStorage = discoverAuthStorage(agentDir);
|
||||
const modelRegistry = discoverModels(authStorage, agentDir);
|
||||
const all = modelRegistry.getAll() as Array<Model<Api>>;
|
||||
|
||||
const filter = parseFilter(process.env.CLAWDBOT_LIVE_GATEWAY_MODELS);
|
||||
|
||||
// Default: honor user allowlist. Opt-in: scan all models with keys.
|
||||
const allowlistKeys = Object.keys(cfg.agents?.defaults?.models ?? {});
|
||||
const wanted =
|
||||
ALL_MODELS || allowlistKeys.length === 0
|
||||
? all
|
||||
: all.filter((m) => allowlistKeys.includes(`${m.provider}/${m.id}`));
|
||||
const rawModels = process.env.CLAWDBOT_LIVE_GATEWAY_MODELS?.trim();
|
||||
const useModern =
|
||||
!rawModels || rawModels === "modern" || rawModels === "all";
|
||||
const useExplicit = Boolean(rawModels) && !useModern;
|
||||
const filter = useExplicit ? parseFilter(rawModels) : null;
|
||||
const wanted = filter
|
||||
? all.filter((m) => filter.has(`${m.provider}/${m.id}`))
|
||||
: all.filter((m) =>
|
||||
isModernModelRef({ provider: m.provider, id: m.id }),
|
||||
);
|
||||
|
||||
const candidates: Array<Model<Api>> = [];
|
||||
for (const model of wanted) {
|
||||
const id = `${model.provider}/${model.id}`;
|
||||
if (PROVIDERS && !PROVIDERS.has(model.provider)) continue;
|
||||
if (filter && !filter.has(id)) continue;
|
||||
try {
|
||||
// eslint-disable-next-line no-await-in-loop
|
||||
await getApiKeyForModel({ model, cfg });
|
||||
@@ -264,315 +697,72 @@ describeLive("gateway live (dev agent, profile keys)", () => {
|
||||
}
|
||||
}
|
||||
|
||||
expect(candidates.length).toBeGreaterThan(0);
|
||||
const imageCandidates = EXTRA_IMAGE_PROBES
|
||||
? candidates.filter((m) => m.input?.includes("image"))
|
||||
: [];
|
||||
if (EXTRA_IMAGE_PROBES && imageCandidates.length === 0) {
|
||||
throw new Error(
|
||||
"image probe enabled but no selected models advertise image support; set CLAWDBOT_LIVE_GATEWAY_MODELS to include an image-capable model",
|
||||
if (candidates.length === 0) {
|
||||
logProgress("[all-models] no API keys found; skipping");
|
||||
return;
|
||||
}
|
||||
logProgress(
|
||||
`[all-models] selection=${useExplicit ? "explicit" : "modern"}`,
|
||||
);
|
||||
const imageCandidates = candidates.filter((m) =>
|
||||
m.input?.includes("image"),
|
||||
);
|
||||
if (imageCandidates.length === 0) {
|
||||
logProgress(
|
||||
"[all-models] no image-capable models selected; image probe will be skipped",
|
||||
);
|
||||
}
|
||||
|
||||
// Build a temp config that allows all selected models, so session overrides stick.
|
||||
const lmstudioProvider = cfg.models?.providers?.lmstudio;
|
||||
const nextCfg = {
|
||||
...cfg,
|
||||
agents: {
|
||||
...cfg.agents,
|
||||
list: (cfg.agents?.list ?? []).map((entry) => ({
|
||||
...entry,
|
||||
sandbox: { mode: "off" },
|
||||
})),
|
||||
defaults: {
|
||||
...cfg.agents?.defaults,
|
||||
// Live tests should avoid Docker sandboxing so tool probes can
|
||||
// operate on the temporary probe files we create in the host workspace.
|
||||
sandbox: { mode: "off" },
|
||||
models: Object.fromEntries(
|
||||
candidates.map((m) => [`${m.provider}/${m.id}`, {}]),
|
||||
),
|
||||
},
|
||||
},
|
||||
models: {
|
||||
...cfg.models,
|
||||
providers: {
|
||||
...cfg.models?.providers,
|
||||
// LM Studio is most reliable via Chat Completions; its Responses API
|
||||
// tool-calling behavior is inconsistent across releases.
|
||||
...(lmstudioProvider
|
||||
? {
|
||||
lmstudio: {
|
||||
...lmstudioProvider,
|
||||
api: "openai-completions",
|
||||
},
|
||||
}
|
||||
: {}),
|
||||
},
|
||||
},
|
||||
};
|
||||
const tempDir = await fs.mkdtemp(
|
||||
path.join(os.tmpdir(), "clawdbot-live-"),
|
||||
);
|
||||
const tempConfigPath = path.join(tempDir, "clawdbot.json");
|
||||
await fs.writeFile(
|
||||
tempConfigPath,
|
||||
`${JSON.stringify(nextCfg, null, 2)}\n`,
|
||||
);
|
||||
process.env.CLAWDBOT_CONFIG_PATH = tempConfigPath;
|
||||
|
||||
const port = await getFreeGatewayPort();
|
||||
const server = await startGatewayServer(port, {
|
||||
bind: "loopback",
|
||||
auth: { mode: "token", token },
|
||||
controlUiEnabled: false,
|
||||
await runGatewayModelSuite({
|
||||
label: "all-models",
|
||||
cfg,
|
||||
candidates,
|
||||
extraToolProbes: true,
|
||||
extraImageProbes: true,
|
||||
thinkingLevel: THINKING_LEVEL,
|
||||
});
|
||||
|
||||
const client = await connectClient({
|
||||
url: `ws://127.0.0.1:${port}`,
|
||||
token,
|
||||
const minimaxCandidates = candidates.filter((model) => model.provider === "minimax");
|
||||
if (minimaxCandidates.length === 0) {
|
||||
logProgress("[minimax] no candidates with keys; skipping dual endpoint probes");
|
||||
return;
|
||||
}
|
||||
|
||||
const minimaxOpenAi = buildMinimaxProviderOverride({
|
||||
cfg,
|
||||
api: "openai-completions",
|
||||
baseUrl: "https://api.minimax.io/v1",
|
||||
});
|
||||
if (minimaxOpenAi) {
|
||||
await runGatewayModelSuite({
|
||||
label: "minimax-openai",
|
||||
cfg,
|
||||
candidates: minimaxCandidates,
|
||||
extraToolProbes: true,
|
||||
extraImageProbes: true,
|
||||
thinkingLevel: THINKING_LEVEL,
|
||||
providerOverrides: { minimax: minimaxOpenAi },
|
||||
});
|
||||
} else {
|
||||
logProgress("[minimax-openai] missing minimax provider config; skipping");
|
||||
}
|
||||
|
||||
try {
|
||||
const sessionKey = "agent:dev:live-gateway";
|
||||
|
||||
const failures: Array<{ model: string; error: string }> = [];
|
||||
|
||||
for (const model of candidates) {
|
||||
const modelKey = `${model.provider}/${model.id}`;
|
||||
|
||||
try {
|
||||
// Ensure session exists + override model for this run.
|
||||
await client.request<Record<string, unknown>>("sessions.patch", {
|
||||
key: sessionKey,
|
||||
model: modelKey,
|
||||
});
|
||||
// Reset between models: avoids cross-provider transcript incompatibilities
|
||||
// (notably OpenAI Responses requiring reasoning replay for function_call items).
|
||||
await client.request<Record<string, unknown>>("sessions.reset", {
|
||||
key: sessionKey,
|
||||
});
|
||||
|
||||
// “Meaningful” direct prompt (no tools).
|
||||
const runId = randomUUID();
|
||||
const payload = await client.request<AgentFinalPayload>(
|
||||
"agent",
|
||||
{
|
||||
sessionKey,
|
||||
idempotencyKey: `idem-${runId}`,
|
||||
message:
|
||||
"Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
|
||||
deliver: false,
|
||||
},
|
||||
{ expectFinal: true },
|
||||
);
|
||||
|
||||
if (payload?.status !== "ok") {
|
||||
throw new Error(`agent status=${String(payload?.status)}`);
|
||||
}
|
||||
const text = extractPayloadText(payload?.result);
|
||||
if (
|
||||
model.provider === "google" &&
|
||||
isGoogleModelNotFoundText(text)
|
||||
) {
|
||||
// Catalog drift: model IDs can disappear or become unavailable on the API.
|
||||
// Treat as skip when scanning "all models" for Google.
|
||||
continue;
|
||||
}
|
||||
if (!isMeaningful(text)) throw new Error(`not meaningful: ${text}`);
|
||||
if (
|
||||
!/\bmicro\s*-?\s*tasks?\b/i.test(text) ||
|
||||
!/\bmacro\s*-?\s*tasks?\b/i.test(text)
|
||||
) {
|
||||
throw new Error(`missing required keywords: ${text}`);
|
||||
}
|
||||
|
||||
// Real tool invocation: force the agent to Read a local file and echo a nonce.
|
||||
const runIdTool = randomUUID();
|
||||
const toolProbe = await client.request<AgentFinalPayload>(
|
||||
"agent",
|
||||
{
|
||||
sessionKey,
|
||||
idempotencyKey: `idem-${runIdTool}-tool`,
|
||||
message:
|
||||
"Clawdbot live tool probe (local, safe): " +
|
||||
`use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
|
||||
"Then reply with the two nonce values you read (include both).",
|
||||
deliver: false,
|
||||
},
|
||||
{ expectFinal: true },
|
||||
);
|
||||
if (toolProbe?.status !== "ok") {
|
||||
throw new Error(
|
||||
`tool probe failed: status=${String(toolProbe?.status)}`,
|
||||
);
|
||||
}
|
||||
const toolText = extractPayloadText(toolProbe?.result);
|
||||
if (!toolText.includes(nonceA) || !toolText.includes(nonceB)) {
|
||||
throw new Error(`tool probe missing nonce: ${toolText}`);
|
||||
}
|
||||
|
||||
if (EXTRA_TOOL_PROBES) {
|
||||
const nonceC = randomUUID();
|
||||
const toolWritePath = path.join(
|
||||
tempDir,
|
||||
`write-${runIdTool}.txt`,
|
||||
);
|
||||
|
||||
const execReadProbe = await client.request<AgentFinalPayload>(
|
||||
"agent",
|
||||
{
|
||||
sessionKey,
|
||||
idempotencyKey: `idem-${runIdTool}-exec-read`,
|
||||
message:
|
||||
"Clawdbot live tool probe (local, safe): " +
|
||||
"use the tool named `exec` (or `Exec`) to run this command: " +
|
||||
`mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
|
||||
`Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
|
||||
"Finally reply including the nonce text you read back.",
|
||||
deliver: false,
|
||||
},
|
||||
{ expectFinal: true },
|
||||
);
|
||||
if (execReadProbe?.status !== "ok") {
|
||||
throw new Error(
|
||||
`exec+read probe failed: status=${String(execReadProbe?.status)}`,
|
||||
);
|
||||
}
|
||||
const execReadText = extractPayloadText(execReadProbe?.result);
|
||||
if (!execReadText.includes(nonceC)) {
|
||||
throw new Error(
|
||||
`exec+read probe missing nonce: ${execReadText}`,
|
||||
);
|
||||
}
|
||||
|
||||
await fs.rm(toolWritePath, { force: true });
|
||||
}
|
||||
|
||||
if (EXTRA_IMAGE_PROBES && model.input?.includes("image")) {
|
||||
const imageCode = randomImageProbeCode(10);
|
||||
const imageBase64 = renderCatNoncePngBase64(imageCode);
|
||||
const runIdImage = randomUUID();
|
||||
|
||||
const imageProbe = await client.request<AgentFinalPayload>(
|
||||
"agent",
|
||||
{
|
||||
sessionKey,
|
||||
idempotencyKey: `idem-${runIdImage}-image`,
|
||||
message:
|
||||
"Look at the attached image. Reply with exactly two tokens separated by a single space: " +
|
||||
"(1) the animal shown or written in the image, lowercase; " +
|
||||
"(2) the code printed in the image, uppercase. No extra text.",
|
||||
attachments: [
|
||||
{
|
||||
mimeType: "image/png",
|
||||
fileName: `probe-${runIdImage}.png`,
|
||||
content: imageBase64,
|
||||
},
|
||||
],
|
||||
deliver: false,
|
||||
},
|
||||
{ expectFinal: true },
|
||||
);
|
||||
if (imageProbe?.status !== "ok") {
|
||||
throw new Error(
|
||||
`image probe failed: status=${String(imageProbe?.status)}`,
|
||||
);
|
||||
}
|
||||
const imageText = extractPayloadText(imageProbe?.result);
|
||||
if (!/\bcat\b/i.test(imageText)) {
|
||||
throw new Error(`image probe missing 'cat': ${imageText}`);
|
||||
}
|
||||
const candidates =
|
||||
imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? [];
|
||||
const bestDistance = candidates.reduce((best, cand) => {
|
||||
if (Math.abs(cand.length - imageCode.length) > 2) return best;
|
||||
return Math.min(best, editDistance(cand, imageCode));
|
||||
}, Number.POSITIVE_INFINITY);
|
||||
if (!(bestDistance <= 2)) {
|
||||
throw new Error(
|
||||
`image probe missing code (${imageCode}): ${imageText}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
// Regression: tool-call-only turn followed by a user message (OpenAI responses bug class).
|
||||
if (
|
||||
(model.provider === "openai" &&
|
||||
model.api === "openai-responses") ||
|
||||
(model.provider === "openai-codex" &&
|
||||
model.api === "openai-codex-responses")
|
||||
) {
|
||||
const runId2 = randomUUID();
|
||||
const first = await client.request<AgentFinalPayload>(
|
||||
"agent",
|
||||
{
|
||||
sessionKey,
|
||||
idempotencyKey: `idem-${runId2}-1`,
|
||||
message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`,
|
||||
deliver: false,
|
||||
},
|
||||
{ expectFinal: true },
|
||||
);
|
||||
if (first?.status !== "ok") {
|
||||
throw new Error(
|
||||
`tool-only turn failed: status=${String(first?.status)}`,
|
||||
);
|
||||
}
|
||||
|
||||
const second = await client.request<AgentFinalPayload>(
|
||||
"agent",
|
||||
{
|
||||
sessionKey,
|
||||
idempotencyKey: `idem-${runId2}-2`,
|
||||
message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`,
|
||||
deliver: false,
|
||||
},
|
||||
{ expectFinal: true },
|
||||
);
|
||||
if (second?.status !== "ok") {
|
||||
throw new Error(
|
||||
`post-tool message failed: status=${String(second?.status)}`,
|
||||
);
|
||||
}
|
||||
const reply = extractPayloadText(second?.result);
|
||||
if (!reply.includes(nonceA) || !reply.includes(nonceB)) {
|
||||
throw new Error(`unexpected reply: ${reply}`);
|
||||
}
|
||||
}
|
||||
} catch (err) {
|
||||
const message = String(err);
|
||||
// OpenAI Codex refresh tokens can become single-use; skip instead of failing all live tests.
|
||||
if (
|
||||
model.provider === "openai-codex" &&
|
||||
isRefreshTokenReused(message)
|
||||
) {
|
||||
continue;
|
||||
}
|
||||
failures.push({ model: modelKey, error: message });
|
||||
}
|
||||
}
|
||||
|
||||
if (failures.length > 0) {
|
||||
const preview = failures
|
||||
.slice(0, 20)
|
||||
.map((f) => `- ${f.model}: ${f.error}`)
|
||||
.join("\n");
|
||||
throw new Error(
|
||||
`gateway live model failures (${failures.length}):\n${preview}`,
|
||||
);
|
||||
}
|
||||
} finally {
|
||||
client.stop();
|
||||
await server.close({ reason: "live test complete" });
|
||||
await fs.rm(toolProbePath, { force: true });
|
||||
await fs.rm(tempDir, { recursive: true, force: true });
|
||||
|
||||
process.env.CLAWDBOT_CONFIG_PATH = previous.configPath;
|
||||
process.env.CLAWDBOT_GATEWAY_TOKEN = previous.token;
|
||||
process.env.CLAWDBOT_SKIP_PROVIDERS = previous.skipProviders;
|
||||
process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = previous.skipGmail;
|
||||
process.env.CLAWDBOT_SKIP_CRON = previous.skipCron;
|
||||
process.env.CLAWDBOT_SKIP_CANVAS_HOST = previous.skipCanvas;
|
||||
const minimaxAnthropic = buildMinimaxProviderOverride({
|
||||
cfg,
|
||||
api: "anthropic-messages",
|
||||
baseUrl: "https://api.minimax.io/anthropic",
|
||||
});
|
||||
if (minimaxAnthropic) {
|
||||
await runGatewayModelSuite({
|
||||
label: "minimax-anthropic",
|
||||
cfg,
|
||||
candidates: minimaxCandidates,
|
||||
extraToolProbes: true,
|
||||
extraImageProbes: true,
|
||||
thinkingLevel: THINKING_LEVEL,
|
||||
providerOverrides: { minimax: minimaxAnthropic },
|
||||
});
|
||||
} else {
|
||||
logProgress("[minimax-anthropic] missing minimax provider config; skipping");
|
||||
}
|
||||
},
|
||||
20 * 60 * 1000,
|
||||
@@ -661,6 +851,7 @@ describeLive("gateway live (dev agent, profile keys)", () => {
|
||||
message:
|
||||
`Call the tool named \`read\` (or \`Read\` if \`read\` is unavailable) with JSON arguments {"path":"${toolProbePath}"}. ` +
|
||||
`Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`,
|
||||
thinking: THINKING_LEVEL,
|
||||
deliver: false,
|
||||
},
|
||||
{ expectFinal: true },
|
||||
@@ -671,6 +862,12 @@ describeLive("gateway live (dev agent, profile keys)", () => {
|
||||
);
|
||||
}
|
||||
const toolText = extractPayloadText(toolProbe?.result);
|
||||
assertNoReasoningTags({
|
||||
text: toolText,
|
||||
model: "anthropic/claude-opus-4-5",
|
||||
phase: "zai-fallback-tool",
|
||||
label: "zai-fallback",
|
||||
});
|
||||
if (!toolText.includes(nonceA) || !toolText.includes(nonceB)) {
|
||||
throw new Error(`anthropic tool probe missing nonce: ${toolText}`);
|
||||
}
|
||||
@@ -689,6 +886,7 @@ describeLive("gateway live (dev agent, profile keys)", () => {
|
||||
message:
|
||||
`What are the values of nonceA and nonceB in "${toolProbePath}"? ` +
|
||||
`Reply with exactly: ${nonceA} ${nonceB}.`,
|
||||
thinking: THINKING_LEVEL,
|
||||
deliver: false,
|
||||
},
|
||||
{ expectFinal: true },
|
||||
@@ -699,6 +897,12 @@ describeLive("gateway live (dev agent, profile keys)", () => {
|
||||
);
|
||||
}
|
||||
const followupText = extractPayloadText(followup?.result);
|
||||
assertNoReasoningTags({
|
||||
text: followupText,
|
||||
model: "zai/glm-4.7",
|
||||
phase: "zai-fallback-followup",
|
||||
label: "zai-fallback",
|
||||
});
|
||||
if (!followupText.includes(nonceA) || !followupText.includes(nonceB)) {
|
||||
throw new Error(`zai followup missing nonce: ${followupText}`);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user