Files
clawdbot/src/gateway/gateway-models.profiles.live.test.ts
Gustavo Madeira Santana acb523de86 CLI: streamline startup paths and env parsing
Add shared parseBooleanValue()/isTruthyEnvValue() and apply across CLI, gateway, memory, and live-test flags for consistent env handling.
Introduce route-first fast paths, lazy subcommand registration, and deferred plugin loading to reduce CLI startup overhead.
Centralize config validation via ensureConfigReady() and add config caching/deferred shell env fallback for fewer IO passes.
Harden logger initialization/imports and add focused tests for argv, boolean parsing, frontmatter, and CLI subcommands.
2026-01-18 23:10:39 +00:00

1063 lines
39 KiB
TypeScript

import { randomBytes, randomUUID } from "node:crypto";
import fs from "node:fs/promises";
import { createServer } from "node:net";
import os from "node:os";
import path from "node:path";
import type { Api, Model } from "@mariozechner/pi-ai";
import { discoverAuthStorage, discoverModels } from "@mariozechner/pi-coding-agent";
import { describe, it } from "vitest";
import { resolveClawdbotAgentDir } from "../agents/agent-paths.js";
import { resolveAgentWorkspaceDir } from "../agents/agent-scope.js";
import {
type AuthProfileStore,
ensureAuthProfileStore,
saveAuthProfileStore,
} from "../agents/auth-profiles.js";
import {
collectAnthropicApiKeys,
isAnthropicBillingError,
isAnthropicRateLimitError,
} from "../agents/live-auth-keys.js";
import { isModernModelRef } from "../agents/live-model-filter.js";
import { getApiKeyForModel } from "../agents/model-auth.js";
import { ensureClawdbotModelsJson } from "../agents/models-config.js";
import { loadConfig } from "../config/config.js";
import type { ClawdbotConfig, ModelProviderConfig } from "../config/types.js";
import { isTruthyEnvValue } from "../infra/env.js";
import { DEFAULT_AGENT_ID } from "../routing/session-key.js";
import { GATEWAY_CLIENT_MODES, GATEWAY_CLIENT_NAMES } from "../utils/message-channel.js";
import { GatewayClient } from "./client.js";
import { renderCatNoncePngBase64 } from "./live-image-probe.js";
import { startGatewayServer } from "./server.js";
// Live-test gating: either generic LIVE flag or the gateway-specific one enables the suite.
const LIVE =
isTruthyEnvValue(process.env.LIVE) || isTruthyEnvValue(process.env.CLAWDBOT_LIVE_TEST);
const GATEWAY_LIVE = isTruthyEnvValue(process.env.CLAWDBOT_LIVE_GATEWAY);
// Opt-in flag for the z.ai fallback regression test at the bottom of this file.
const ZAI_FALLBACK = isTruthyEnvValue(process.env.CLAWDBOT_LIVE_GATEWAY_ZAI_FALLBACK);
// Optional comma-separated provider allowlist; null means "all providers".
const PROVIDERS = parseFilter(process.env.CLAWDBOT_LIVE_GATEWAY_PROVIDERS);
// Thinking level requested for every live agent run in this file.
const THINKING_LEVEL = "high";
// Detects leaked reasoning markup (<think>, <thinking>, <thought>, <antthinking>), open or close tags.
const THINKING_TAG_RE = /<\s*\/?\s*(?:think(?:ing)?|thought|antthinking)\s*>/i;
// Detects leaked <final> wrapper tags.
const FINAL_TAG_RE = /<\s*\/?\s*final\s*>/i;
// Entire suite is skipped unless one of the live flags is set.
const describeLive = LIVE || GATEWAY_LIVE ? describe : describe.skip;
/**
 * Parses a comma-separated filter list from an env var value.
 * Returns null when the value is unset, blank, or the literal "all"
 * (all of which mean "no filtering"); otherwise a Set of trimmed ids.
 */
function parseFilter(raw?: string): Set<string> | null {
  const value = raw?.trim();
  if (!value || value === "all") return null;
  const entries: string[] = [];
  for (const part of value.split(",")) {
    const id = part.trim();
    if (id) entries.push(id);
  }
  return entries.length > 0 ? new Set(entries) : null;
}
/** Prints a "[live]"-prefixed progress line so live-run output is easy to grep. */
function logProgress(message: string): void {
  console.log("[live] " + message);
}
/**
 * Fails fast when a model response leaks raw reasoning/final markup tags.
 * Empty text passes; otherwise throws with a snippet capped at 200 chars
 * so the failure message stays readable.
 */
function assertNoReasoningTags(params: {
  text: string;
  model: string;
  phase: string;
  label: string;
}): void {
  const { text, model, phase, label } = params;
  if (!text) return;
  const leaked = THINKING_TAG_RE.test(text) || FINAL_TAG_RE.test(text);
  if (!leaked) return;
  const snippet = text.length > 200 ? text.slice(0, 200) : text;
  throw new Error(`[${label}] reasoning tag leak (${model} / ${phase}): ${snippet}`);
}
/**
 * Joins every non-blank string `text` field found in the `payloads` array
 * of an agent result into one trimmed, newline-separated blob.
 * A missing or non-array `payloads` yields "".
 */
function extractPayloadText(result: unknown): string {
  const record = result as Record<string, unknown>;
  const rawPayloads = Array.isArray(record.payloads) ? record.payloads : [];
  const collected: string[] = [];
  for (const entry of rawPayloads) {
    if (!entry || typeof entry !== "object") continue;
    const candidate = (entry as Record<string, unknown>).text;
    if (typeof candidate === "string" && candidate.trim().length > 0) {
      collected.push(candidate);
    }
  }
  return collected.join("\n").trim();
}
/**
 * Heuristic "real answer" check: rejects empty text, a bare "ok",
 * anything under 60 characters, or fewer than 12 whitespace-separated words.
 */
function isMeaningful(text: string): boolean {
  if (!text) return false;
  const trimmed = text.trim();
  if (trimmed.toLowerCase() === "ok" || trimmed.length < 60) return false;
  const wordCount = trimmed.split(/\s+/g).filter(Boolean).length;
  return wordCount >= 12;
}
/**
 * Detects Google "model not found" API error bodies. Requires a literal
 * "not found" phrase plus one of the known error shapes (message pattern,
 * NOT_FOUND status field, or a 404 code field).
 */
function isGoogleModelNotFoundText(text: string): boolean {
  const body = text.trim();
  if (!body) return false;
  if (!/not found/i.test(body)) return false;
  return (
    /models\/.+ is not found for api version/i.test(body) ||
    /"status"\s*:\s*"NOT_FOUND"/.test(body) ||
    /"code"\s*:\s*404/.test(body)
  );
}
/** True for the base "google" provider and any "google-*" variant. */
function isGoogleishProvider(provider: string): boolean {
  return /^google(-|$)/.test(provider);
}
/** Matches the OAuth error emitted when a single-use refresh token is replayed. */
function isRefreshTokenReused(error: string): boolean {
  return error.toLowerCase().includes("refresh_token_reused");
}
/** Detects ChatGPT usage-limit errors; both marker phrases must appear. */
function isChatGPTUsageLimitErrorMessage(raw: string): boolean {
  const lowered = raw.toLowerCase();
  const markers = ["hit your chatgpt usage limit", "try again in"];
  return markers.every((marker) => lowered.includes(marker));
}
/** True when the error indicates an auth profile with no stored credentials. */
function isMissingProfileError(error: string): boolean {
  return error.toLowerCase().includes("no credentials found for profile");
}
/** True when the text carries the "stream ended with no chunks" marker. */
function isEmptyStreamText(text: string): boolean {
  const marker = "request ended without sending any chunks";
  return text.includes(marker);
}
/**
 * Produces a random uppercase code for the image OCR probe.
 * The alphabet deliberately avoids glyphs our 5x7 bitmap font renders
 * ambiguously (0↔8, B↔8, 6↔9, 3↔B, D↔0) and must stay within the glyph
 * set supported by `src/gateway/live-image-probe.ts`.
 */
function randomImageProbeCode(len = 6): string {
  const alphabet = "24567ACEF";
  const entropy = randomBytes(len);
  return Array.from(entropy, (byte) => alphabet[byte % alphabet.length]).join("");
}
/**
 * Levenshtein edit distance (unit-cost insert/delete/substitute).
 * Used to tolerate small OCR errors when matching image-probe codes.
 */
function editDistance(a: string, b: string): number {
  if (a === b) return 0;
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;
  const rows = a.length + 1;
  const cols = b.length + 1;
  // Full DP matrix, seeded with the trivial "all inserts / all deletes" edges.
  const dist: number[][] = Array.from({ length: rows }, (_row, i) =>
    Array.from({ length: cols }, (_col, j) => (i === 0 ? j : j === 0 ? i : 0)),
  );
  for (let i = 1; i < rows; i += 1) {
    for (let j = 1; j < cols; j += 1) {
      const substitutionCost = a[i - 1] === b[j - 1] ? 0 : 1;
      dist[i][j] = Math.min(
        dist[i - 1][j] + 1, // delete from a
        dist[i][j - 1] + 1, // insert into a
        dist[i - 1][j - 1] + substitutionCost, // substitute
      );
    }
  }
  return dist[rows - 1][cols - 1];
}
/**
 * Asks the OS for an ephemeral loopback port by listening on port 0,
 * then closes the probe server and resolves with the assigned port.
 * Rejects on listen errors, a non-object address, or a close failure.
 */
async function getFreePort(): Promise<number> {
  return await new Promise<number>((resolve, reject) => {
    const probe = createServer();
    probe.on("error", reject);
    probe.listen(0, "127.0.0.1", () => {
      const address = probe.address();
      if (address === null || typeof address === "string") {
        probe.close();
        reject(new Error("failed to acquire free port"));
        return;
      }
      const { port } = address;
      probe.close((closeErr) => (closeErr ? reject(closeErr) : resolve(port)));
    });
  });
}
/**
 * Checks whether a loopback port can currently be bound.
 * Out-of-range or non-finite ports resolve false without touching sockets;
 * otherwise a probe server is bound and immediately closed.
 */
async function isPortFree(port: number): Promise<boolean> {
  const inRange = Number.isFinite(port) && port > 0 && port <= 65535;
  if (!inRange) return false;
  return await new Promise<boolean>((resolve) => {
    const probe = createServer();
    probe.once("error", () => resolve(false));
    probe.listen(port, "127.0.0.1", () => {
      probe.close(() => resolve(true));
    });
  });
}
/**
 * Acquires a base port whose common derived offsets are also free.
 * The gateway binds companion services (bridge/browser/canvas) on derived
 * ports, so accepting a base port alone invites flaky collisions.
 * Gives up after 25 attempts.
 */
async function getFreeGatewayPort(): Promise<number> {
  const MAX_ATTEMPTS = 25;
  const offsets = [0, 1, 2, 4];
  for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt += 1) {
    const base = await getFreePort();
    const checks = await Promise.all(offsets.map((offset) => isPortFree(base + offset)));
    if (checks.every(Boolean)) return base;
  }
  throw new Error("failed to acquire a free gateway port block");
}
// Minimal shape of the terminal payload returned by an `agent` request.
// Fields stay `unknown` because live responses are validated defensively
// (status is compared to "ok", result goes through extractPayloadText).
type AgentFinalPayload = {
status?: unknown;
result?: unknown;
};
// Opens a GatewayClient connection and resolves once the hello handshake
// succeeds; rejects on connect error, premature close, or a 10s timeout.
// The `settled` flag guarantees exactly one resolution even if multiple
// callbacks fire (e.g. an error followed by a close).
async function connectClient(params: { url: string; token: string }) {
return await new Promise<GatewayClient>((resolve, reject) => {
let settled = false;
// Single settle point: clears the timeout and resolves/rejects once.
// `timer` is referenced before its declaration below — safe because
// `stop` only runs after the synchronous setup completes.
const stop = (err?: Error, client?: GatewayClient) => {
if (settled) return;
settled = true;
clearTimeout(timer);
if (err) reject(err);
else resolve(client as GatewayClient);
};
const client = new GatewayClient({
url: params.url,
token: params.token,
clientName: GATEWAY_CLIENT_NAMES.TEST,
clientDisplayName: "vitest-live",
clientVersion: "dev",
mode: GATEWAY_CLIENT_MODES.TEST,
onHelloOk: () => stop(undefined, client),
onConnectError: (err) => stop(err),
onClose: (code, reason) =>
stop(new Error(`gateway closed during connect (${code}): ${reason}`)),
});
// Hard connect deadline; unref'd so it never keeps the process alive.
const timer = setTimeout(() => stop(new Error("gateway connect timeout")), 10_000);
timer.unref();
client.start();
});
}
// Inputs for one pass of runGatewayModelSuite.
type GatewayModelSuiteParams = {
// Prefix used in log lines, session keys, and failure messages (e.g. "all-models").
label: string;
// Host config the live config is derived from.
cfg: ClawdbotConfig;
// Models to exercise, in order.
candidates: Array<Model<Api>>;
// When true, also run the exec+read tool probe.
extraToolProbes: boolean;
// When true, also run the best-effort image OCR probe.
extraImageProbes: boolean;
// Thinking level passed on every agent request.
thinkingLevel: string;
// Optional per-provider config overrides (e.g. alternate MiniMax endpoints).
providerOverrides?: Record<string, ModelProviderConfig>;
};
/**
 * Derives the config written to the temporary gateway config file:
 * disables sandboxing on every agent (so tool probes can touch host files),
 * registers all candidate models as defaults, forces LM Studio onto the
 * OpenAI-completions API, and layers on any per-provider overrides.
 */
function buildLiveGatewayConfig(params: {
  cfg: ClawdbotConfig;
  candidates: Array<Model<Api>>;
  providerOverrides?: Record<string, ModelProviderConfig>;
}): ClawdbotConfig {
  const { cfg, candidates } = params;
  const baseProviders = cfg.models?.providers ?? {};
  const merged: Record<string, ModelProviderConfig> = { ...baseProviders };
  const lmstudioProvider = cfg.models?.providers?.lmstudio;
  if (lmstudioProvider) {
    // LM Studio is driven over the OpenAI completions protocol in live runs.
    merged.lmstudio = { ...lmstudioProvider, api: "openai-completions" };
  }
  Object.assign(merged, params.providerOverrides ?? {});
  const providers = Object.keys(merged).length > 0 ? merged : baseProviders;
  // Every candidate becomes a default model entry keyed "provider/id".
  const defaultModels = Object.fromEntries(
    candidates.map((model) => [`${model.provider}/${model.id}`, {}]),
  );
  const agentList = (cfg.agents?.list ?? []).map((entry) => ({
    ...entry,
    sandbox: { mode: "off" },
  }));
  return {
    ...cfg,
    agents: {
      ...cfg.agents,
      list: agentList,
      defaults: {
        ...cfg.agents?.defaults,
        // Live tests avoid Docker sandboxing so tool probes can operate on
        // the temporary probe files created in the host workspace.
        sandbox: { mode: "off" },
        models: defaultModels,
      },
    },
    models: Object.keys(providers).length > 0 ? { ...cfg.models, providers } : cfg.models,
  };
}
/**
 * Strips auth profile references that have no backing credentials in the
 * on-disk profile store, so the live gateway never selects a profile it
 * cannot actually authenticate with. Returns undefined when no profiles,
 * order entries, or cooldowns survive the filtering.
 */
function sanitizeAuthConfig(params: {
  cfg: ClawdbotConfig;
  agentDir: string;
}): ClawdbotConfig["auth"] | undefined {
  const auth = params.cfg.auth;
  if (!auth) return auth;
  const store = ensureAuthProfileStore(params.agentDir, {
    allowKeychainPrompt: false,
  });
  const hasCreds = (profileId: string): boolean => Boolean(store.profiles[profileId]);
  let profiles: NonNullable<ClawdbotConfig["auth"]>["profiles"] | undefined;
  if (auth.profiles) {
    const kept = Object.entries(auth.profiles).filter(([profileId]) => hasCreds(profileId));
    profiles = kept.length > 0 ? Object.fromEntries(kept) : undefined;
  }
  let order: Record<string, string[]> | undefined;
  if (auth.order) {
    const keptOrder: Record<string, string[]> = {};
    for (const [provider, ids] of Object.entries(auth.order)) {
      const usable = ids.filter((id) => hasCreds(id));
      if (usable.length > 0) keptOrder[provider] = usable;
    }
    order = Object.keys(keptOrder).length > 0 ? keptOrder : undefined;
  }
  if (!profiles && !order && !auth.cooldowns) return undefined;
  return {
    ...auth,
    profiles,
    order,
  };
}
/**
 * Clones the configured MiniMax provider with an alternate API flavor and
 * base URL (used to probe both OpenAI- and Anthropic-shaped endpoints).
 * Returns null when no MiniMax provider with a non-empty model list exists.
 */
function buildMinimaxProviderOverride(params: {
  cfg: ClawdbotConfig;
  api: "openai-completions" | "anthropic-messages";
  baseUrl: string;
}): ModelProviderConfig | null {
  const minimax = params.cfg.models?.providers?.minimax;
  if (!minimax) return null;
  if (!Array.isArray(minimax.models) || minimax.models.length === 0) return null;
  return {
    ...minimax,
    api: params.api,
    baseUrl: params.baseUrl,
  };
}
// Core live-suite runner. For each candidate model it: resets a session,
// runs a prompt probe (with one retry on empty output), a mandatory tool-read
// probe, optional exec+read and image probes, and an OpenAI-specific
// tool-only-turn regression. The gateway runs against an isolated temp
// state dir + config file; every env var touched is restored in `finally`.
// Known-flaky provider failures are skipped; everything else is collected
// and thrown as an aggregate at the end.
async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
// Snapshot every env var mutated below so the finally block can restore it.
const previous = {
configPath: process.env.CLAWDBOT_CONFIG_PATH,
token: process.env.CLAWDBOT_GATEWAY_TOKEN,
skipChannels: process.env.CLAWDBOT_SKIP_CHANNELS,
skipGmail: process.env.CLAWDBOT_SKIP_GMAIL_WATCHER,
skipCron: process.env.CLAWDBOT_SKIP_CRON,
skipCanvas: process.env.CLAWDBOT_SKIP_CANVAS_HOST,
agentDir: process.env.CLAWDBOT_AGENT_DIR,
piAgentDir: process.env.PI_CODING_AGENT_DIR,
stateDir: process.env.CLAWDBOT_STATE_DIR,
};
let tempAgentDir: string | undefined;
let tempStateDir: string | undefined;
// Disable side-channel subsystems so the gateway only serves this test.
process.env.CLAWDBOT_SKIP_CHANNELS = "1";
process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = "1";
process.env.CLAWDBOT_SKIP_CRON = "1";
process.env.CLAWDBOT_SKIP_CANVAS_HOST = "1";
// Fresh single-use gateway token per run.
const token = `test-${randomUUID()}`;
process.env.CLAWDBOT_GATEWAY_TOKEN = token;
const agentId = "dev";
// Copy the host's auth profile store into the temp agent dirs so the
// gateway authenticates with the same credentials as the host.
const hostAgentDir = resolveClawdbotAgentDir();
const hostStore = ensureAuthProfileStore(hostAgentDir, {
allowKeychainPrompt: false,
});
const sanitizedStore: AuthProfileStore = {
version: hostStore.version,
profiles: { ...hostStore.profiles },
// Keep selection state so the gateway picks the same known-good profiles
// as the host (important when some profiles are rate-limited/disabled).
order: hostStore.order ? { ...hostStore.order } : undefined,
lastGood: hostStore.lastGood ? { ...hostStore.lastGood } : undefined,
usageStats: hostStore.usageStats ? { ...hostStore.usageStats } : undefined,
};
tempStateDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-live-state-"));
process.env.CLAWDBOT_STATE_DIR = tempStateDir;
tempAgentDir = path.join(tempStateDir, "agents", DEFAULT_AGENT_ID, "agent");
saveAuthProfileStore(sanitizedStore, tempAgentDir);
// Mirror the store under the session agent's dir too (they differ when
// agentId !== DEFAULT_AGENT_ID).
const tempSessionAgentDir = path.join(tempStateDir, "agents", agentId, "agent");
if (tempSessionAgentDir !== tempAgentDir) {
saveAuthProfileStore(sanitizedStore, tempSessionAgentDir);
}
process.env.CLAWDBOT_AGENT_DIR = tempAgentDir;
process.env.PI_CODING_AGENT_DIR = tempAgentDir;
const workspaceDir = resolveAgentWorkspaceDir(params.cfg, agentId);
await fs.mkdir(workspaceDir, { recursive: true });
// Probe file with two nonces; models must read it back via the `read` tool.
const nonceA = randomUUID();
const nonceB = randomUUID();
const toolProbePath = path.join(workspaceDir, `.clawdbot-live-tool-probe.${nonceA}.txt`);
await fs.writeFile(toolProbePath, `nonceA=${nonceA}\nnonceB=${nonceB}\n`);
const agentDir = resolveClawdbotAgentDir();
const sanitizedCfg: ClawdbotConfig = {
...params.cfg,
auth: sanitizeAuthConfig({ cfg: params.cfg, agentDir }),
};
const nextCfg = buildLiveGatewayConfig({
cfg: sanitizedCfg,
candidates: params.candidates,
providerOverrides: params.providerOverrides,
});
// Write the derived config to a temp file and point the gateway at it.
const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-live-"));
const tempConfigPath = path.join(tempDir, "clawdbot.json");
await fs.writeFile(tempConfigPath, `${JSON.stringify(nextCfg, null, 2)}\n`);
process.env.CLAWDBOT_CONFIG_PATH = tempConfigPath;
await ensureClawdbotModelsJson(nextCfg);
// Start a loopback, token-authed gateway and connect a test client.
const port = await getFreeGatewayPort();
const server = await startGatewayServer(port, {
bind: "loopback",
auth: { mode: "token", token },
controlUiEnabled: false,
});
const client = await connectClient({
url: `ws://127.0.0.1:${port}`,
token,
});
try {
logProgress(
`[${params.label}] running ${params.candidates.length} models (thinking=${params.thinkingLevel})`,
);
// Multiple Anthropic keys enable per-key retry on rate-limit/billing errors.
const anthropicKeys = collectAnthropicApiKeys();
if (anthropicKeys.length > 0) {
process.env.ANTHROPIC_API_KEY = anthropicKeys[0];
logProgress(`[${params.label}] anthropic keys loaded: ${anthropicKeys.length}`);
}
const sessionKey = `agent:${agentId}:${params.label}`;
const failures: Array<{ model: string; error: string }> = [];
let skippedCount = 0;
const total = params.candidates.length;
// One pass per model; Anthropic models get one attempt per available key.
for (const [index, model] of params.candidates.entries()) {
const modelKey = `${model.provider}/${model.id}`;
const progressLabel = `[${params.label}] ${index + 1}/${total} ${modelKey}`;
const attemptMax =
model.provider === "anthropic" && anthropicKeys.length > 0 ? anthropicKeys.length : 1;
for (let attempt = 0; attempt < attemptMax; attempt += 1) {
if (model.provider === "anthropic" && anthropicKeys.length > 0) {
process.env.ANTHROPIC_API_KEY = anthropicKeys[attempt];
}
try {
// Ensure session exists + override model for this run.
// Reset between models: avoids cross-provider transcript incompatibilities
// (notably OpenAI Responses requiring reasoning replay for function_call items).
await client.request<Record<string, unknown>>("sessions.reset", {
key: sessionKey,
});
await client.request<Record<string, unknown>>("sessions.patch", {
key: sessionKey,
model: modelKey,
});
// --- Probe 1: plain prompt with required keywords ---
logProgress(`${progressLabel}: prompt`);
const runId = randomUUID();
const payload = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId}`,
message:
"Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
);
if (payload?.status !== "ok") {
throw new Error(`agent status=${String(payload?.status)}`);
}
let text = extractPayloadText(payload?.result);
// One retry on an empty first response before classifying failure.
if (!text) {
logProgress(`${progressLabel}: empty response, retrying`);
const retry = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${randomUUID()}-retry`,
message:
"Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
);
if (retry?.status !== "ok") {
throw new Error(`agent status=${String(retry?.status)}`);
}
text = extractPayloadText(retry?.result);
}
// Known-flaky provider responses are skipped rather than failed.
if (!text && isGoogleishProvider(model.provider)) {
logProgress(`${progressLabel}: skip (google empty response)`);
break;
}
if (
isEmptyStreamText(text) &&
(model.provider === "minimax" || model.provider === "openai-codex")
) {
logProgress(`${progressLabel}: skip (${model.provider} empty response)`);
break;
}
if (isGoogleishProvider(model.provider) && isGoogleModelNotFoundText(text)) {
// Catalog drift: model IDs can disappear or become unavailable on the API.
// Treat as skip when scanning "all models" for Google.
logProgress(`${progressLabel}: skip (google model not found)`);
break;
}
assertNoReasoningTags({
text,
model: modelKey,
phase: "prompt",
label: params.label,
});
if (!isMeaningful(text)) {
if (isGoogleishProvider(model.provider) && /gemini/i.test(model.id)) {
logProgress(`${progressLabel}: skip (google not meaningful)`);
break;
}
throw new Error(`not meaningful: ${text}`);
}
if (!/\bmicro\s*-?\s*tasks?\b/i.test(text) || !/\bmacro\s*-?\s*tasks?\b/i.test(text)) {
throw new Error(`missing required keywords: ${text}`);
}
// Real tool invocation: force the agent to Read a local file and echo a nonce.
logProgress(`${progressLabel}: tool-read`);
const runIdTool = randomUUID();
const toolProbe = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runIdTool}-tool`,
message:
"Clawdbot live tool probe (local, safe): " +
`use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
"Then reply with the two nonce values you read (include both).",
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
);
if (toolProbe?.status !== "ok") {
throw new Error(`tool probe failed: status=${String(toolProbe?.status)}`);
}
const toolText = extractPayloadText(toolProbe?.result);
if (
isEmptyStreamText(toolText) &&
(model.provider === "minimax" || model.provider === "openai-codex")
) {
logProgress(`${progressLabel}: skip (${model.provider} empty response)`);
break;
}
assertNoReasoningTags({
text: toolText,
model: modelKey,
phase: "tool-read",
label: params.label,
});
if (!toolText.includes(nonceA) || !toolText.includes(nonceB)) {
throw new Error(`tool probe missing nonce: ${toolText}`);
}
// --- Optional probe: exec a write, then read it back ---
if (params.extraToolProbes) {
logProgress(`${progressLabel}: tool-exec`);
const nonceC = randomUUID();
const toolWritePath = path.join(tempDir, `write-${runIdTool}.txt`);
const execReadProbe = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runIdTool}-exec-read`,
message:
"Clawdbot live tool probe (local, safe): " +
"use the tool named `exec` (or `Exec`) to run this command: " +
`mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
`Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
"Finally reply including the nonce text you read back.",
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
);
if (execReadProbe?.status !== "ok") {
throw new Error(`exec+read probe failed: status=${String(execReadProbe?.status)}`);
}
const execReadText = extractPayloadText(execReadProbe?.result);
if (
isEmptyStreamText(execReadText) &&
(model.provider === "minimax" || model.provider === "openai-codex")
) {
logProgress(`${progressLabel}: skip (${model.provider} empty response)`);
break;
}
assertNoReasoningTags({
text: execReadText,
model: modelKey,
phase: "tool-exec",
label: params.label,
});
if (!execReadText.includes(nonceC)) {
throw new Error(`exec+read probe missing nonce: ${execReadText}`);
}
await fs.rm(toolWritePath, { force: true });
}
// --- Optional probe: image attachment + OCR (best-effort, never fails the run) ---
if (params.extraImageProbes && model.input?.includes("image")) {
logProgress(`${progressLabel}: image`);
// Shorter code => less OCR flake across providers, still tests image attachments end-to-end.
const imageCode = randomImageProbeCode();
const imageBase64 = renderCatNoncePngBase64(imageCode);
const runIdImage = randomUUID();
const imageProbe = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runIdImage}-image`,
message:
"Look at the attached image. Reply with exactly two tokens separated by a single space: " +
"(1) the animal shown or written in the image, lowercase; " +
"(2) the code printed in the image, uppercase. No extra text.",
attachments: [
{
mimeType: "image/png",
fileName: `probe-${runIdImage}.png`,
content: imageBase64,
},
],
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
);
// Best-effort: do not fail the whole live suite on flaky image handling.
// (We still keep prompt + tool probes as hard checks.)
if (imageProbe?.status !== "ok") {
logProgress(`${progressLabel}: image skip (status=${String(imageProbe?.status)})`);
} else {
const imageText = extractPayloadText(imageProbe?.result);
if (
isEmptyStreamText(imageText) &&
(model.provider === "minimax" || model.provider === "openai-codex")
) {
logProgress(`${progressLabel}: image skip (${model.provider} empty response)`);
} else {
assertNoReasoningTags({
text: imageText,
model: modelKey,
phase: "image",
label: params.label,
});
if (!/\bcat\b/i.test(imageText)) {
logProgress(`${progressLabel}: image skip (missing 'cat')`);
} else {
// Compare every plausible code token against the expected code by edit distance.
const candidates = imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? [];
const bestDistance = candidates.reduce((best, cand) => {
if (Math.abs(cand.length - imageCode.length) > 2) return best;
return Math.min(best, editDistance(cand, imageCode));
}, Number.POSITIVE_INFINITY);
// OCR / image-read flake: allow a small edit distance, but still require the "cat" token above.
if (!(bestDistance <= 3)) {
logProgress(`${progressLabel}: image skip (code mismatch)`);
}
}
}
}
}
// Regression: tool-call-only turn followed by a user message (OpenAI responses bug class).
if (
(model.provider === "openai" && model.api === "openai-responses") ||
(model.provider === "openai-codex" && model.api === "openai-codex-responses")
) {
logProgress(`${progressLabel}: tool-only regression`);
const runId2 = randomUUID();
const first = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId2}-1`,
message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`,
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
);
if (first?.status !== "ok") {
throw new Error(`tool-only turn failed: status=${String(first?.status)}`);
}
const firstText = extractPayloadText(first?.result);
assertNoReasoningTags({
text: firstText,
model: modelKey,
phase: "tool-only",
label: params.label,
});
const second = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId2}-2`,
message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`,
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
);
if (second?.status !== "ok") {
throw new Error(`post-tool message failed: status=${String(second?.status)}`);
}
const reply = extractPayloadText(second?.result);
assertNoReasoningTags({
text: reply,
model: modelKey,
phase: "tool-only-followup",
label: params.label,
});
if (!reply.includes(nonceA) || !reply.includes(nonceB)) {
throw new Error(`unexpected reply: ${reply}`);
}
}
logProgress(`${progressLabel}: done`);
break;
} catch (err) {
// Classify the failure: retry with another Anthropic key, skip known-flaky
// provider conditions, or record it as a hard failure.
const message = String(err);
if (
model.provider === "anthropic" &&
isAnthropicRateLimitError(message) &&
attempt + 1 < attemptMax
) {
logProgress(`${progressLabel}: rate limit, retrying with next key`);
continue;
}
if (model.provider === "anthropic" && isAnthropicBillingError(message)) {
if (attempt + 1 < attemptMax) {
logProgress(`${progressLabel}: billing issue, retrying with next key`);
continue;
}
logProgress(`${progressLabel}: skip (anthropic billing)`);
break;
}
if (
model.provider === "anthropic" &&
isEmptyStreamText(message) &&
attempt + 1 < attemptMax
) {
logProgress(`${progressLabel}: empty response, retrying with next key`);
continue;
}
if (model.provider === "anthropic" && isEmptyStreamText(message)) {
skippedCount += 1;
logProgress(`${progressLabel}: skip (anthropic empty response)`);
break;
}
// OpenAI Codex refresh tokens can become single-use; skip instead of failing all live tests.
if (model.provider === "openai-codex" && isRefreshTokenReused(message)) {
logProgress(`${progressLabel}: skip (codex refresh token reused)`);
break;
}
if (model.provider === "openai-codex" && isChatGPTUsageLimitErrorMessage(message)) {
logProgress(`${progressLabel}: skip (chatgpt usage limit)`);
break;
}
if (isMissingProfileError(message)) {
skippedCount += 1;
logProgress(`${progressLabel}: skip (missing auth profile)`);
break;
}
if (params.label.startsWith("minimax-")) {
skippedCount += 1;
logProgress(`${progressLabel}: skip (minimax endpoint error)`);
break;
}
logProgress(`${progressLabel}: failed`);
failures.push({ model: modelKey, error: message });
break;
}
}
}
// Aggregate hard failures into one error with a preview of up to 20 entries.
if (failures.length > 0) {
const preview = failures
.slice(0, 20)
.map((f) => `- ${f.model}: ${f.error}`)
.join("\n");
throw new Error(`gateway live model failures (${failures.length}):\n${preview}`);
}
if (skippedCount === total) {
logProgress(`[${params.label}] skipped all models (missing profiles)`);
}
} finally {
// Always tear down the client/server, remove temp files/dirs, and restore
// the env snapshot taken at entry so later tests see the original values.
client.stop();
await server.close({ reason: "live test complete" });
await fs.rm(toolProbePath, { force: true });
await fs.rm(tempDir, { recursive: true, force: true });
if (tempAgentDir) {
await fs.rm(tempAgentDir, { recursive: true, force: true });
}
if (tempStateDir) {
await fs.rm(tempStateDir, { recursive: true, force: true });
}
process.env.CLAWDBOT_CONFIG_PATH = previous.configPath;
process.env.CLAWDBOT_GATEWAY_TOKEN = previous.token;
process.env.CLAWDBOT_SKIP_CHANNELS = previous.skipChannels;
process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = previous.skipGmail;
process.env.CLAWDBOT_SKIP_CRON = previous.skipCron;
process.env.CLAWDBOT_SKIP_CANVAS_HOST = previous.skipCanvas;
process.env.CLAWDBOT_AGENT_DIR = previous.agentDir;
process.env.PI_CODING_AGENT_DIR = previous.piAgentDir;
process.env.CLAWDBOT_STATE_DIR = previous.stateDir;
}
}
// Live gateway suite — runs only when LIVE/CLAWDBOT_LIVE_TEST or
// CLAWDBOT_LIVE_GATEWAY is truthy (describeLive falls back to describe.skip).
describeLive("gateway live (dev agent, profile keys)", () => {
it(
"runs meaningful prompts across models with available keys",
async () => {
const cfg = loadConfig();
await ensureClawdbotModelsJson(cfg);
const agentDir = resolveClawdbotAgentDir();
const authStore = ensureAuthProfileStore(agentDir, {
allowKeychainPrompt: false,
});
const authStorage = discoverAuthStorage(agentDir);
const modelRegistry = discoverModels(authStorage, agentDir);
const all = modelRegistry.getAll() as Array<Model<Api>>;
// Model selection: explicit CSV list via CLAWDBOT_LIVE_GATEWAY_MODELS,
// otherwise the "modern" filter ("modern"/"all"/unset all mean modern here).
const rawModels = process.env.CLAWDBOT_LIVE_GATEWAY_MODELS?.trim();
const useModern = !rawModels || rawModels === "modern" || rawModels === "all";
const useExplicit = Boolean(rawModels) && !useModern;
const filter = useExplicit ? parseFilter(rawModels) : null;
const wanted = filter
? all.filter((m) => filter.has(`${m.provider}/${m.id}`))
: all.filter((m) => isModernModelRef({ provider: m.provider, id: m.id }));
// Keep only models whose API key resolves from a stored auth profile
// (raw env-var keys are excluded), honoring the PROVIDERS allowlist.
const candidates: Array<Model<Api>> = [];
for (const model of wanted) {
if (PROVIDERS && !PROVIDERS.has(model.provider)) continue;
try {
// eslint-disable-next-line no-await-in-loop
const apiKeyInfo = await getApiKeyForModel({
model,
cfg,
store: authStore,
agentDir,
});
if (!apiKeyInfo.source.startsWith("profile:")) {
continue;
}
candidates.push(model);
} catch {
// no creds; skip
}
}
if (candidates.length === 0) {
logProgress("[all-models] no API keys found; skipping");
return;
}
logProgress(`[all-models] selection=${useExplicit ? "explicit" : "modern"}`);
const imageCandidates = candidates.filter((m) => m.input?.includes("image"));
if (imageCandidates.length === 0) {
logProgress("[all-models] no image-capable models selected; image probe will be skipped");
}
await runGatewayModelSuite({
label: "all-models",
cfg,
candidates,
extraToolProbes: true,
extraImageProbes: true,
thinkingLevel: THINKING_LEVEL,
});
// Second pass: re-run MiniMax candidates against the Anthropic-shaped endpoint.
const minimaxCandidates = candidates.filter((model) => model.provider === "minimax");
if (minimaxCandidates.length === 0) {
logProgress("[minimax] no candidates with keys; skipping dual endpoint probes");
return;
}
const minimaxAnthropic = buildMinimaxProviderOverride({
cfg,
api: "anthropic-messages",
baseUrl: "https://api.minimax.io/anthropic",
});
if (minimaxAnthropic) {
await runGatewayModelSuite({
label: "minimax-anthropic",
cfg,
candidates: minimaxCandidates,
extraToolProbes: true,
extraImageProbes: true,
thinkingLevel: THINKING_LEVEL,
providerOverrides: { minimax: minimaxAnthropic },
});
} else {
logProgress("[minimax-anthropic] missing minimax provider config; skipping");
}
},
// Generous timeout: the full model matrix can take many minutes live.
20 * 60 * 1000,
);
// Regression: a session with Anthropic tool-call history must replay cleanly
// after switching the same session to the z.ai fallback model.
it("z.ai fallback handles anthropic tool history", async () => {
if (!ZAI_FALLBACK) return;
// Snapshot/restore the env vars this test mutates (see finally below).
const previous = {
configPath: process.env.CLAWDBOT_CONFIG_PATH,
token: process.env.CLAWDBOT_GATEWAY_TOKEN,
skipChannels: process.env.CLAWDBOT_SKIP_CHANNELS,
skipGmail: process.env.CLAWDBOT_SKIP_GMAIL_WATCHER,
skipCron: process.env.CLAWDBOT_SKIP_CRON,
skipCanvas: process.env.CLAWDBOT_SKIP_CANVAS_HOST,
};
process.env.CLAWDBOT_SKIP_CHANNELS = "1";
process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = "1";
process.env.CLAWDBOT_SKIP_CRON = "1";
process.env.CLAWDBOT_SKIP_CANVAS_HOST = "1";
const token = `test-${randomUUID()}`;
process.env.CLAWDBOT_GATEWAY_TOKEN = token;
const cfg = loadConfig();
await ensureClawdbotModelsJson(cfg);
const agentDir = resolveClawdbotAgentDir();
const authStorage = discoverAuthStorage(agentDir);
const modelRegistry = discoverModels(authStorage, agentDir);
const anthropic = modelRegistry.find("anthropic", "claude-opus-4-5") as Model<Api> | null;
const zai = modelRegistry.find("zai", "glm-4.7") as Model<Api> | null;
if (!anthropic || !zai) return;
// Both models need resolvable credentials, otherwise skip silently.
try {
await getApiKeyForModel({ model: anthropic, cfg });
await getApiKeyForModel({ model: zai, cfg });
} catch {
return;
}
const agentId = "dev";
const workspaceDir = resolveAgentWorkspaceDir(cfg, agentId);
await fs.mkdir(workspaceDir, { recursive: true });
const nonceA = randomUUID();
const nonceB = randomUUID();
const toolProbePath = path.join(workspaceDir, `.clawdbot-live-zai-fallback.${nonceA}.txt`);
await fs.writeFile(toolProbePath, `nonceA=${nonceA}\nnonceB=${nonceB}\n`);
const port = await getFreeGatewayPort();
// NOTE(review): this call passes a single options object, while
// runGatewayModelSuite calls startGatewayServer(port, { ... }) — confirm
// both call shapes are supported overloads of startGatewayServer.
const server = await startGatewayServer({
configPath: cfg.__meta?.path,
port,
token,
});
const client = await connectClient({
url: `ws://127.0.0.1:${port}`,
token,
});
try {
// Pin the session to Anthropic, reset history, then force a real tool read.
const sessionKey = `agent:${agentId}:live-zai-fallback`;
await client.request<Record<string, unknown>>("sessions.patch", {
key: sessionKey,
model: "anthropic/claude-opus-4-5",
});
await client.request<Record<string, unknown>>("sessions.reset", {
key: sessionKey,
});
const runId = randomUUID();
const toolProbe = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId}-tool`,
message:
`Call the tool named \`read\` (or \`Read\` if \`read\` is unavailable) with JSON arguments {"path":"${toolProbePath}"}. ` +
`Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`,
thinking: THINKING_LEVEL,
deliver: false,
},
{ expectFinal: true },
);
if (toolProbe?.status !== "ok") {
throw new Error(`anthropic tool probe failed: status=${String(toolProbe?.status)}`);
}
const toolText = extractPayloadText(toolProbe?.result);
assertNoReasoningTags({
text: toolText,
model: "anthropic/claude-opus-4-5",
phase: "zai-fallback-tool",
label: "zai-fallback",
});
if (!toolText.includes(nonceA) || !toolText.includes(nonceB)) {
throw new Error(`anthropic tool probe missing nonce: ${toolText}`);
}
// Switch the same session (now carrying Anthropic tool history) to z.ai
// and verify the transcript replays without errors.
await client.request<Record<string, unknown>>("sessions.patch", {
key: sessionKey,
model: "zai/glm-4.7",
});
const followupId = randomUUID();
const followup = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${followupId}-followup`,
message:
`What are the values of nonceA and nonceB in "${toolProbePath}"? ` +
`Reply with exactly: ${nonceA} ${nonceB}.`,
thinking: THINKING_LEVEL,
deliver: false,
},
{ expectFinal: true },
);
if (followup?.status !== "ok") {
throw new Error(`zai followup failed: status=${String(followup?.status)}`);
}
const followupText = extractPayloadText(followup?.result);
assertNoReasoningTags({
text: followupText,
model: "zai/glm-4.7",
phase: "zai-fallback-followup",
label: "zai-fallback",
});
if (!followupText.includes(nonceA) || !followupText.includes(nonceB)) {
throw new Error(`zai followup missing nonce: ${followupText}`);
}
} finally {
// Tear down and restore env overrides so later tests see the host environment.
client.stop();
await server.close({ reason: "live test complete" });
await fs.rm(toolProbePath, { force: true });
process.env.CLAWDBOT_CONFIG_PATH = previous.configPath;
process.env.CLAWDBOT_GATEWAY_TOKEN = previous.token;
process.env.CLAWDBOT_SKIP_CHANNELS = previous.skipChannels;
process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = previous.skipGmail;
process.env.CLAWDBOT_SKIP_CRON = previous.skipCron;
process.env.CLAWDBOT_SKIP_CANVAS_HOST = previous.skipCanvas;
}
}, 180_000);
});