fix: modernize live tests and gemini ids

This commit is contained in:
Peter Steinberger
2026-01-12 06:58:31 +00:00
parent 79cbb20988
commit 1850013cae
11 changed files with 1053 additions and 593 deletions

View File

@@ -0,0 +1,50 @@
// Separators accepted between keys: any run of whitespace, commas, or semicolons.
const KEY_SPLIT_RE = /[\s,;]+/g;

/**
 * Splits a raw delimiter-separated key list into trimmed, non-empty entries.
 * Returns an empty array for null/undefined/empty input.
 */
function parseKeyList(raw?: string | null): string[] {
  if (!raw) return [];
  const keys: string[] = [];
  for (const piece of raw.split(KEY_SPLIT_RE)) {
    const key = piece.trim();
    if (key) keys.push(key);
  }
  return keys;
}
/**
 * Collects trimmed, non-empty values from every environment variable whose
 * name starts with `prefix`, in the iteration order of `process.env`.
 */
function collectEnvPrefixedKeys(prefix: string): string[] {
  const found: string[] = [];
  for (const name of Object.keys(process.env)) {
    if (!name.startsWith(prefix)) continue;
    const candidate = process.env[name]?.trim();
    if (candidate) found.push(candidate);
  }
  return found;
}
/**
 * Resolves the ordered, de-duplicated set of Anthropic API keys for live tests.
 *
 * Priority: a single forced key (CLAWDBOT_LIVE_ANTHROPIC_KEY) wins outright;
 * otherwise the explicit key list, then the canonical ANTHROPIC_API_KEY, then
 * any ANTHROPIC_API_KEY*-prefixed variables. Duplicates keep their first
 * position (Set preserves insertion order).
 */
export function collectAnthropicApiKeys(): string[] {
  const forced = process.env.CLAWDBOT_LIVE_ANTHROPIC_KEY?.trim();
  if (forced) return [forced];
  const ordered = [
    ...parseKeyList(process.env.CLAWDBOT_LIVE_ANTHROPIC_KEYS),
    process.env.ANTHROPIC_API_KEY?.trim() ?? "",
    ...collectEnvPrefixedKeys("ANTHROPIC_API_KEY"),
  ];
  const unique = new Set(ordered.filter((key) => key.length > 0));
  return [...unique];
}
/**
 * Heuristically detects Anthropic rate-limit failures from an error message
 * (case-insensitive match on "rate_limit", "rate limit", or "429").
 */
export function isAnthropicRateLimitError(message: string): boolean {
  const normalized = message.toLowerCase();
  const markers = ["rate_limit", "rate limit", "429"];
  return markers.some((marker) => normalized.includes(marker));
}

View File

@@ -0,0 +1,89 @@
/**
 * Loose provider/model reference; either part may be absent or null for
 * unresolved refs, so consumers must validate both before use.
 */
export type ModelRef = {
  provider?: string | null;
  id?: string | null;
};
// Per-provider model-id prefixes/ids that count as "modern" for live tests.
// NOTE(review): these lists track current provider catalogs and presumably
// need periodic updates as new model generations ship — verify against the
// providers' published model ids.
const ANTHROPIC_PREFIXES = [
  "claude-opus-4-5",
  "claude-sonnet-4-5",
  "claude-haiku-4-5",
];
// Exact-or-prefix matches for direct OpenAI models.
const OPENAI_MODELS = ["gpt-5.2", "gpt-5.0"];
// Exact-or-prefix matches for the OpenAI Codex provider.
const CODEX_MODELS = [
  "gpt-5.2",
  "gpt-5.2-codex",
  "gpt-5.1-codex",
  "gpt-5.1-codex-mini",
  "gpt-5.1-codex-max",
];
const GOOGLE_PREFIXES = ["gemini-3"];
const ZAI_PREFIXES = ["glm-4.7"];
const MINIMAX_PREFIXES = ["minimax-m2.1"];
const XAI_PREFIXES = ["grok-4"];
/** True when `id` starts with any of the given prefixes. */
function matchesPrefix(id: string, prefixes: string[]): boolean {
  return prefixes.some((prefix) => id.startsWith(prefix));
}

/**
 * True when `id` equals or starts with any of the given values.
 * `startsWith` already returns true for an exact match, so no separate
 * equality check is needed.
 */
function matchesExactOrPrefix(id: string, values: string[]): boolean {
  return values.some((value) => id.startsWith(value));
}

/** True when `id` contains any of the given values as a substring. */
function matchesAny(id: string, values: string[]): boolean {
  return values.some((value) => id.includes(value));
}
/**
 * Decides whether a provider/model pair is a "modern" model worth exercising
 * in live tests. Matching is case-insensitive; refs missing either part are
 * never modern. Aggregator providers (openrouter/opencode) match any known
 * model id as a substring since their ids embed upstream ids.
 */
export function isModernModelRef(ref: ModelRef): boolean {
  const provider = ref.provider?.trim().toLowerCase() ?? "";
  const id = ref.id?.trim().toLowerCase() ?? "";
  if (!provider || !id) return false;
  switch (provider) {
    case "anthropic":
      return matchesPrefix(id, ANTHROPIC_PREFIXES);
    case "openai":
      return matchesExactOrPrefix(id, OPENAI_MODELS);
    case "openai-codex":
      return matchesExactOrPrefix(id, CODEX_MODELS);
    case "google":
    case "google-gemini-cli":
      return matchesPrefix(id, GOOGLE_PREFIXES);
    case "google-antigravity":
      // Antigravity can route to either Gemini or Claude models.
      return (
        matchesPrefix(id, GOOGLE_PREFIXES) ||
        matchesPrefix(id, ANTHROPIC_PREFIXES)
      );
    case "zai":
      return matchesPrefix(id, ZAI_PREFIXES);
    case "minimax":
      return matchesPrefix(id, MINIMAX_PREFIXES);
    case "xai":
      return matchesPrefix(id, XAI_PREFIXES);
    case "openrouter":
    case "opencode":
      return matchesAny(id, [
        ...ANTHROPIC_PREFIXES,
        ...OPENAI_MODELS,
        ...CODEX_MODELS,
        ...GOOGLE_PREFIXES,
        ...ZAI_PREFIXES,
        ...MINIMAX_PREFIXES,
        ...XAI_PREFIXES,
      ]);
    default:
      return false;
  }
}

View File

@@ -117,4 +117,59 @@ describe("models config", () => {
);
});
});
// Regression: ensureClawdbotModelsJson must rewrite bare Gemini 3 ids
// ("gemini-3-pro" / "gemini-3-flash") to their "-preview" variants when
// writing models.json for the google provider.
it("normalizes gemini 3 ids to preview for google providers", async () => {
await withTempHome(async () => {
// Reset the module cache so the imports below pick up the temp-home paths.
vi.resetModules();
const { ensureClawdbotModelsJson } = await import("./models-config.js");
const { resolveClawdbotAgentDir } = await import("./agent-paths.js");
// Minimal google provider config using the deprecated bare model ids.
const cfg: ClawdbotConfig = {
models: {
providers: {
google: {
baseUrl: "https://generativelanguage.googleapis.com/v1beta",
apiKey: "GEMINI_KEY",
api: "google-generative-ai",
models: [
{
id: "gemini-3-pro",
name: "Gemini 3 Pro",
api: "google-generative-ai",
reasoning: true,
input: ["text", "image"],
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
contextWindow: 1048576,
maxTokens: 65536,
},
{
id: "gemini-3-flash",
name: "Gemini 3 Flash",
api: "google-generative-ai",
reasoning: false,
input: ["text", "image"],
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
contextWindow: 1048576,
maxTokens: 65536,
},
],
},
},
},
};
await ensureClawdbotModelsJson(cfg);
// Read back the generated models.json and check only the rewritten ids.
const modelPath = path.join(resolveClawdbotAgentDir(), "models.json");
const raw = await fs.readFile(modelPath, "utf8");
const parsed = JSON.parse(raw) as {
providers: Record<string, { models: Array<{ id: string }> }>;
};
const ids = parsed.providers.google?.models?.map((model) => model.id);
expect(ids).toEqual([
"gemini-3-pro-preview",
"gemini-3-flash-preview",
]);
});
});
});

View File

@@ -5,6 +5,7 @@ import { type ClawdbotConfig, loadConfig } from "../config/config.js";
import { resolveClawdbotAgentDir } from "./agent-paths.js";
type ModelsConfig = NonNullable<ClawdbotConfig["models"]>;
type ProviderConfig = NonNullable<ModelsConfig["providers"]>[string];
const DEFAULT_MODE: NonNullable<ModelsConfig["mode"]> = "merge";
@@ -12,6 +13,38 @@ function isRecord(value: unknown): value is Record<string, unknown> {
return Boolean(value && typeof value === "object" && !Array.isArray(value));
}
/**
 * Maps deprecated bare Gemini 3 ids to their "-preview" variants; any other
 * id passes through unchanged.
 */
function normalizeGoogleModelId(id: string): string {
  switch (id) {
    case "gemini-3-pro":
      return "gemini-3-pro-preview";
    case "gemini-3-flash":
      return "gemini-3-flash-preview";
    default:
      return id;
  }
}
/**
 * Rewrites deprecated bare Gemini 3 model ids inside a google provider
 * config to their "-preview" variants.
 *
 * Returns the original provider object (same reference) when nothing needed
 * rewriting, so callers can detect mutation by identity comparison.
 */
function normalizeGoogleProvider(provider: ProviderConfig): ProviderConfig {
  // Defensive: a provider entry may omit `models` entirely (user-authored
  // config); treat that as "nothing to normalize" instead of crashing on
  // `.map` of undefined.
  if (!Array.isArray(provider.models)) return provider;
  let mutated = false;
  const models = provider.models.map((model) => {
    const nextId = normalizeGoogleModelId(model.id);
    if (nextId === model.id) return model;
    mutated = true;
    // Copy-on-write: only rewritten entries are cloned.
    return { ...model, id: nextId };
  });
  return mutated ? { ...provider, models } : provider;
}
/**
 * Applies provider-specific id normalization (currently only for "google")
 * across the providers map. Returns the input object untouched (same
 * reference) when no provider changed.
 */
function normalizeProviders(
  providers: ModelsConfig["providers"],
): ModelsConfig["providers"] {
  if (!providers) return providers;
  const next: Record<string, ProviderConfig> = {};
  let changed = false;
  for (const [name, provider] of Object.entries(providers)) {
    const normalized =
      name === "google" ? normalizeGoogleProvider(provider) : provider;
    if (normalized !== provider) changed = true;
    next[name] = normalized;
  }
  return changed ? next : providers;
}
async function readJson(pathname: string): Promise<unknown> {
try {
const raw = await fs.readFile(pathname, "utf8");
@@ -53,7 +86,8 @@ export async function ensureClawdbotModelsJson(
}
}
const next = `${JSON.stringify({ providers: mergedProviders }, null, 2)}\n`;
const normalizedProviders = normalizeProviders(mergedProviders);
const next = `${JSON.stringify({ providers: normalizedProviders }, null, 2)}\n`;
try {
existingRaw = await fs.readFile(targetPath, "utf8");
} catch {

View File

@@ -7,24 +7,20 @@ import { Type } from "@sinclair/typebox";
import { describe, expect, it } from "vitest";
import { loadConfig } from "../config/config.js";
import { resolveClawdbotAgentDir } from "./agent-paths.js";
import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
import { getApiKeyForModel } from "./model-auth.js";
import {
buildModelAliasIndex,
parseModelRef,
resolveConfiguredModelRef,
resolveModelRefFromString,
} from "./model-selection.js";
collectAnthropicApiKeys,
isAnthropicRateLimitError,
} from "./live-auth-keys.js";
import { isModernModelRef } from "./live-model-filter.js";
import { getApiKeyForModel } from "./model-auth.js";
import { ensureClawdbotModelsJson } from "./models-config.js";
const LIVE = process.env.LIVE === "1" || process.env.CLAWDBOT_LIVE_TEST === "1";
const ALL_MODELS =
process.env.CLAWDBOT_LIVE_ALL_MODELS === "1" ||
process.env.CLAWDBOT_LIVE_MODELS === "all";
const DIRECT_ENABLED = Boolean(process.env.CLAWDBOT_LIVE_MODELS?.trim());
const REQUIRE_PROFILE_KEYS =
process.env.CLAWDBOT_LIVE_REQUIRE_PROFILE_KEYS === "1";
const describeLive = LIVE && ALL_MODELS ? describe : describe.skip;
const describeLive = LIVE ? describe : describe.skip;
function parseProviderFilter(raw?: string): Set<string> | null {
const trimmed = raw?.trim();
@@ -46,6 +42,10 @@ function parseModelFilter(raw?: string): Set<string> | null {
return ids.length ? new Set(ids) : null;
}
/** Writes a `[live]`-prefixed progress message to stdout. */
function logProgress(message: string): void {
  console.log("[live] " + message);
}
function isGoogleModelNotFoundError(err: unknown): boolean {
const msg = String(err);
if (!/not found/i.test(msg)) return false;
@@ -127,75 +127,25 @@ async function completeOkWithRetry(params: {
return await runOnce();
}
/**
 * Collects the ordered, de-duplicated list of "provider/model" keys the
 * config points at: the resolved default model first, then the agent
 * defaults' primary/fallback model strings (text and image), then the keys
 * of the per-model overrides map.
 */
function resolveConfiguredModelKeys(
cfg: ReturnType<typeof loadConfig>,
): string[] {
const aliasIndex = buildModelAliasIndex({
cfg,
defaultProvider: DEFAULT_PROVIDER,
});
// `order` preserves first-seen position; `seen` provides O(1) dedupe.
const order: string[] = [];
const seen = new Set<string>();
const addKey = (key: string) => {
const normalized = key.trim();
if (!normalized || seen.has(normalized)) return;
seen.add(normalized);
order.push(normalized);
};
const addRef = (ref: { provider: string; model: string }) => {
addKey(`${ref.provider}/${ref.model}`);
};
// The resolved default model always comes first.
addRef(
resolveConfiguredModelRef({
cfg,
defaultProvider: DEFAULT_PROVIDER,
defaultModel: DEFAULT_MODEL,
}),
);
// NOTE(review): these casts assume the config schema stores model settings
// as { primary, fallbacks } — verify against the ClawdbotConfig types.
const modelConfig = cfg.agents?.defaults?.model as
| { primary?: string; fallbacks?: string[] }
| undefined;
const imageModelConfig = cfg.agents?.defaults?.imageModel as
| { primary?: string; fallbacks?: string[] }
| undefined;
const primary = modelConfig?.primary?.trim() ?? "";
const fallbacks = modelConfig?.fallbacks ?? [];
const imagePrimary = imageModelConfig?.primary?.trim() ?? "";
const imageFallbacks = imageModelConfig?.fallbacks ?? [];
// Raw strings may be aliases; resolve them before adding.
const addRaw = (raw: string) => {
const resolved = resolveModelRefFromString({
raw,
defaultProvider: DEFAULT_PROVIDER,
aliasIndex,
});
if (resolved) addRef(resolved.ref);
};
if (primary) addRaw(primary);
for (const raw of fallbacks) addRaw(String(raw ?? ""));
if (imagePrimary) addRaw(imagePrimary);
for (const raw of imageFallbacks) addRaw(String(raw ?? ""));
// Finally include every key of the per-model overrides map.
for (const key of Object.keys(cfg.agents?.defaults?.models ?? {})) {
const parsed = parseModelRef(String(key ?? ""), DEFAULT_PROVIDER);
if (parsed) addRef(parsed);
}
return order;
}
describeLive("live models (profile keys)", () => {
it(
"completes across configured models",
"completes across selected models",
async () => {
const cfg = loadConfig();
await ensureClawdbotModelsJson(cfg);
if (!DIRECT_ENABLED) {
logProgress(
"[live-models] skipping (set CLAWDBOT_LIVE_MODELS=modern|all|<list>; all=modern)",
);
return;
}
const anthropicKeys = collectAnthropicApiKeys();
if (anthropicKeys.length > 0) {
process.env.ANTHROPIC_API_KEY = anthropicKeys[0];
logProgress(
`[live-models] anthropic keys loaded: ${anthropicKeys.length}`,
);
}
const agentDir = resolveClawdbotAgentDir();
const authStorage = discoverAuthStorage(agentDir);
@@ -205,7 +155,11 @@ describeLive("live models (profile keys)", () => {
models.map((model) => [`${model.provider}/${model.id}`, model]),
);
const filter = parseModelFilter(process.env.CLAWDBOT_LIVE_MODELS);
const rawModels = process.env.CLAWDBOT_LIVE_MODELS?.trim();
const useModern = rawModels === "modern" || rawModels === "all";
const useExplicit = Boolean(rawModels) && !useModern;
const filter = useExplicit ? parseModelFilter(rawModels) : null;
const allowNotFoundSkip = useModern;
const providers = parseProviderFilter(
process.env.CLAWDBOT_LIVE_PROVIDERS,
);
@@ -216,149 +170,196 @@ describeLive("live models (profile keys)", () => {
const failures: Array<{ model: string; error: string }> = [];
const skipped: Array<{ model: string; reason: string }> = [];
const candidates: Array<{
model: Model<Api>;
apiKeyInfo: Awaited<ReturnType<typeof getApiKeyForModel>>;
}> = [];
const configuredKeys = resolveConfiguredModelKeys(cfg);
for (const key of configuredKeys) {
const model = modelByKey.get(key);
if (!model) {
skipped.push({
model: key,
reason: "configured model missing in registry",
});
continue;
}
for (const model of models) {
if (providers && !providers.has(model.provider)) continue;
const id = `${model.provider}/${model.id}`;
if (filter && !filter.has(id)) continue;
let apiKeyInfo: Awaited<ReturnType<typeof getApiKeyForModel>>;
try {
apiKeyInfo = await getApiKeyForModel({ model, cfg });
} catch (err) {
skipped.push({ model: id, reason: String(err) });
continue;
}
if (REQUIRE_PROFILE_KEYS && !apiKeyInfo.source.startsWith("profile:")) {
skipped.push({
model: id,
reason: `non-profile credential source: ${apiKeyInfo.source}`,
});
continue;
}
try {
// Special regression: OpenAI requires replayed `reasoning` items for tool-only turns.
if (
model.provider === "openai" &&
model.api === "openai-responses" &&
model.id === "gpt-5.2"
) {
const noopTool = {
name: "noop",
description: "Return ok.",
parameters: Type.Object({}, { additionalProperties: false }),
};
const first = await completeSimpleWithTimeout(
model,
{
messages: [
{
role: "user",
content:
"Call the tool `noop` with {}. Do not write any other text.",
timestamp: Date.now(),
},
],
tools: [noopTool],
},
{
apiKey: apiKeyInfo.apiKey,
reasoning: model.reasoning ? "low" : undefined,
maxTokens: 128,
},
perModelTimeoutMs,
);
const toolCall = first.content.find((b) => b.type === "toolCall");
expect(toolCall).toBeTruthy();
if (!toolCall || toolCall.type !== "toolCall") {
throw new Error("expected tool call");
}
const second = await completeSimpleWithTimeout(
model,
{
messages: [
{
role: "user",
content:
"Call the tool `noop` with {}. Do not write any other text.",
timestamp: Date.now(),
},
first,
{
role: "toolResult",
toolCallId: toolCall.id,
toolName: "noop",
content: [{ type: "text", text: "ok" }],
isError: false,
timestamp: Date.now(),
},
{
role: "user",
content: "Reply with the word ok.",
timestamp: Date.now(),
},
],
},
{
apiKey: apiKeyInfo.apiKey,
reasoning: model.reasoning ? "low" : undefined,
maxTokens: 64,
},
perModelTimeoutMs,
);
const secondText = second.content
.filter((b) => b.type === "text")
.map((b) => b.text.trim())
.join(" ");
expect(secondText.length).toBeGreaterThan(0);
if (!filter && useModern) {
if (!isModernModelRef({ provider: model.provider, id: model.id })) {
continue;
}
const ok = await completeOkWithRetry({
model,
apiKey: apiKeyInfo.apiKey,
timeoutMs: perModelTimeoutMs,
});
if (ok.res.stopReason === "error") {
const msg = ok.res.errorMessage ?? "";
if (ALL_MODELS && isModelNotFoundErrorMessage(msg)) {
skipped.push({ model: id, reason: msg });
continue;
}
throw new Error(msg || "model returned error with no message");
}
if (ok.text.length === 0 && model.provider === "google") {
}
try {
const apiKeyInfo = await getApiKeyForModel({ model, cfg });
if (
REQUIRE_PROFILE_KEYS &&
!apiKeyInfo.source.startsWith("profile:")
) {
skipped.push({
model: id,
reason: "no text returned (likely unavailable model id)",
reason: `non-profile credential source: ${apiKeyInfo.source}`,
});
continue;
}
expect(ok.text.length).toBeGreaterThan(0);
candidates.push({ model, apiKeyInfo });
} catch (err) {
if (model.provider === "google" && isGoogleModelNotFoundError(err)) {
skipped.push({ model: id, reason: String(err) });
continue;
skipped.push({ model: id, reason: String(err) });
}
}
if (candidates.length === 0) {
logProgress("[live-models] no API keys found; skipping");
return;
}
logProgress(
`[live-models] selection=${useExplicit ? "explicit" : "modern"}`,
);
logProgress(`[live-models] running ${candidates.length} models`);
const total = candidates.length;
for (const [index, entry] of candidates.entries()) {
const { model, apiKeyInfo } = entry;
const id = `${model.provider}/${model.id}`;
const progressLabel = `[live-models] ${index + 1}/${total} ${id}`;
const attemptMax =
model.provider === "anthropic" && anthropicKeys.length > 0
? anthropicKeys.length
: 1;
for (let attempt = 0; attempt < attemptMax; attempt += 1) {
if (model.provider === "anthropic" && anthropicKeys.length > 0) {
process.env.ANTHROPIC_API_KEY = anthropicKeys[attempt];
}
const apiKey =
model.provider === "anthropic" && anthropicKeys.length > 0
? anthropicKeys[attempt]
: apiKeyInfo.apiKey;
try {
// Special regression: OpenAI requires replayed `reasoning` items for tool-only turns.
if (
model.provider === "openai" &&
model.api === "openai-responses" &&
model.id === "gpt-5.2"
) {
logProgress(`${progressLabel}: tool-only regression`);
const noopTool = {
name: "noop",
description: "Return ok.",
parameters: Type.Object({}, { additionalProperties: false }),
};
const first = await completeSimpleWithTimeout(
model,
{
messages: [
{
role: "user",
content:
"Call the tool `noop` with {}. Do not write any other text.",
timestamp: Date.now(),
},
],
tools: [noopTool],
},
{
apiKey,
reasoning: model.reasoning ? "low" : undefined,
maxTokens: 128,
},
perModelTimeoutMs,
);
const toolCall = first.content.find((b) => b.type === "toolCall");
expect(toolCall).toBeTruthy();
if (!toolCall || toolCall.type !== "toolCall") {
throw new Error("expected tool call");
}
const second = await completeSimpleWithTimeout(
model,
{
messages: [
{
role: "user",
content:
"Call the tool `noop` with {}. Do not write any other text.",
timestamp: Date.now(),
},
first,
{
role: "toolResult",
toolCallId: toolCall.id,
toolName: "noop",
content: [{ type: "text", text: "ok" }],
isError: false,
timestamp: Date.now(),
},
{
role: "user",
content: "Reply with the word ok.",
timestamp: Date.now(),
},
],
},
{
apiKey,
reasoning: model.reasoning ? "low" : undefined,
maxTokens: 64,
},
perModelTimeoutMs,
);
const secondText = second.content
.filter((b) => b.type === "text")
.map((b) => b.text.trim())
.join(" ");
expect(secondText.length).toBeGreaterThan(0);
logProgress(`${progressLabel}: done`);
break;
}
logProgress(`${progressLabel}: prompt`);
const ok = await completeOkWithRetry({
model,
apiKey,
timeoutMs: perModelTimeoutMs,
});
if (ok.res.stopReason === "error") {
const msg = ok.res.errorMessage ?? "";
if (allowNotFoundSkip && isModelNotFoundErrorMessage(msg)) {
skipped.push({ model: id, reason: msg });
logProgress(`${progressLabel}: skip (model not found)`);
break;
}
throw new Error(msg || "model returned error with no message");
}
if (ok.text.length === 0 && model.provider === "google") {
skipped.push({
model: id,
reason: "no text returned (likely unavailable model id)",
});
logProgress(`${progressLabel}: skip (google model not found)`);
break;
}
expect(ok.text.length).toBeGreaterThan(0);
logProgress(`${progressLabel}: done`);
break;
} catch (err) {
const message = String(err);
if (
model.provider === "anthropic" &&
isAnthropicRateLimitError(message) &&
attempt + 1 < attemptMax
) {
logProgress(`${progressLabel}: rate limit, retrying with next key`);
continue;
}
if (model.provider === "google" && isGoogleModelNotFoundError(err)) {
skipped.push({ model: id, reason: message });
logProgress(`${progressLabel}: skip (google model not found)`);
break;
}
logProgress(`${progressLabel}: failed`);
failures.push({ model: id, error: message });
break;
}
failures.push({ model: id, error: String(err) });
}
}
@@ -372,8 +373,6 @@ describeLive("live models (profile keys)", () => {
);
}
// Keep one assertion so the test fails loudly if we somehow ran nothing.
expect(models.length).toBeGreaterThan(0);
void skipped;
},
15 * 60 * 1000,

View File

@@ -11,9 +11,15 @@ import {
} from "@mariozechner/pi-coding-agent";
import { describe, expect, it } from "vitest";
import { resolveClawdbotAgentDir } from "../agents/agent-paths.js";
import {
collectAnthropicApiKeys,
isAnthropicRateLimitError,
} from "../agents/live-auth-keys.js";
import { isModernModelRef } from "../agents/live-model-filter.js";
import { getApiKeyForModel } from "../agents/model-auth.js";
import { ensureClawdbotModelsJson } from "../agents/models-config.js";
import { loadConfig } from "../config/config.js";
import type { ClawdbotConfig, ModelProviderConfig } from "../config/types.js";
import {
GATEWAY_CLIENT_MODES,
GATEWAY_CLIENT_NAMES,
@@ -25,16 +31,14 @@ import { startGatewayServer } from "./server.js";
const LIVE = process.env.LIVE === "1" || process.env.CLAWDBOT_LIVE_TEST === "1";
const GATEWAY_LIVE = process.env.CLAWDBOT_LIVE_GATEWAY === "1";
const ALL_MODELS =
process.env.CLAWDBOT_LIVE_GATEWAY_ALL_MODELS === "1" ||
process.env.CLAWDBOT_LIVE_GATEWAY_MODELS === "all";
const EXTRA_TOOL_PROBES = process.env.CLAWDBOT_LIVE_GATEWAY_TOOL_PROBE === "1";
const EXTRA_IMAGE_PROBES =
process.env.CLAWDBOT_LIVE_GATEWAY_IMAGE_PROBE === "1";
const ZAI_FALLBACK = process.env.CLAWDBOT_LIVE_GATEWAY_ZAI_FALLBACK === "1";
const PROVIDERS = parseFilter(process.env.CLAWDBOT_LIVE_GATEWAY_PROVIDERS);
const THINKING_LEVEL = "high";
const THINKING_TAG_RE =
/<\s*\/?\s*(?:think(?:ing)?|thought|antthinking)\s*>/i;
const FINAL_TAG_RE = /<\s*\/?\s*final\s*>/i;
const describeLive = LIVE && GATEWAY_LIVE ? describe : describe.skip;
const describeLive = LIVE || GATEWAY_LIVE ? describe : describe.skip;
function parseFilter(raw?: string): Set<string> | null {
const trimmed = raw?.trim();
@@ -46,6 +50,26 @@ function parseFilter(raw?: string): Set<string> | null {
return ids.length ? new Set(ids) : null;
}
/** Prints a `[live]`-prefixed progress line for gateway live-test output. */
function logProgress(message: string): void {
  const line = `[live] ${message}`;
  console.log(line);
}
/**
 * Throws when a model response leaks internal reasoning markup (thinking or
 * final tags) into visible text. Empty text passes trivially. The thrown
 * message embeds at most the first 200 characters of the offending text.
 */
function assertNoReasoningTags(params: {
  text: string;
  model: string;
  phase: string;
  label: string;
}): void {
  const { text, model, phase, label } = params;
  if (!text) return;
  const leaked = THINKING_TAG_RE.test(text) || FINAL_TAG_RE.test(text);
  if (!leaked) return;
  const snippet = text.length > 200 ? `${text.slice(0, 200)}` : text;
  throw new Error(
    `[${label}] reasoning tag leak (${model} / ${phase}): ${snippet}`,
  );
}
function extractPayloadText(result: unknown): string {
const record = result as Record<string, unknown>;
const payloads = Array.isArray(record.payloads) ? record.payloads : [];
@@ -200,61 +224,470 @@ async function connectClient(params: { url: string; token: string }) {
});
}
/**
 * Inputs for one run of the shared live-gateway model suite.
 * `label` tags log lines and the session key; `candidates` are the models to
 * exercise; the probe flags enable optional exec/image checks; and
 * `providerOverrides` lets callers swap provider configs (e.g. minimax API
 * variants) before the gateway boots.
 */
type GatewayModelSuiteParams = {
  label: string;
  cfg: ClawdbotConfig;
  candidates: Array<Model<Api>>;
  extraToolProbes: boolean;
  extraImageProbes: boolean;
  thinkingLevel: string;
  providerOverrides?: Record<string, ModelProviderConfig>;
};
/**
 * Derives the config used by the live gateway run from the user's config:
 * disables sandboxing on all agents (probes touch host files), restricts the
 * default models map to the candidate models, forces lmstudio onto the
 * OpenAI-completions API, and layers in any provider overrides.
 */
function buildLiveGatewayConfig(params: {
  cfg: ClawdbotConfig;
  candidates: Array<Model<Api>>;
  providerOverrides?: Record<string, ModelProviderConfig>;
}): ClawdbotConfig {
  const { cfg, candidates, providerOverrides } = params;
  const baseProviders = cfg.models?.providers ?? {};
  const lmstudio = cfg.models?.providers?.lmstudio;
  // Overrides win over the lmstudio patch, which wins over the base map.
  const nextProviders = {
    ...baseProviders,
    ...(lmstudio
      ? { lmstudio: { ...lmstudio, api: "openai-completions" } }
      : {}),
    ...(providerOverrides ?? {}),
  };
  const providers =
    Object.keys(nextProviders).length > 0 ? nextProviders : baseProviders;
  // Empty override objects: presence in the map is what matters downstream.
  const candidateModels = Object.fromEntries(
    candidates.map((model) => [`${model.provider}/${model.id}`, {}]),
  );
  return {
    ...cfg,
    agents: {
      ...cfg.agents,
      list: (cfg.agents?.list ?? []).map((entry) => ({
        ...entry,
        sandbox: { mode: "off" },
      })),
      defaults: {
        ...cfg.agents?.defaults,
        // Live tests should avoid Docker sandboxing so tool probes can
        // operate on the temporary probe files we create in the host workspace.
        sandbox: { mode: "off" },
        models: candidateModels,
      },
    },
    models:
      Object.keys(providers).length > 0
        ? { ...cfg.models, providers }
        : cfg.models,
  };
}
/**
 * Clones the configured minimax provider with a different API flavor and
 * base URL. Returns null when no minimax provider with at least one model is
 * configured, so callers can skip the variant.
 */
function buildMinimaxProviderOverride(params: {
  cfg: ClawdbotConfig;
  api: "openai-completions" | "anthropic-messages";
  baseUrl: string;
}): ModelProviderConfig | null {
  const existing = params.cfg.models?.providers?.minimax;
  if (!existing) return null;
  const models = existing.models;
  if (!Array.isArray(models) || models.length === 0) return null;
  return {
    ...existing,
    api: params.api,
    baseUrl: params.baseUrl,
  };
}
async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
const previous = {
configPath: process.env.CLAWDBOT_CONFIG_PATH,
token: process.env.CLAWDBOT_GATEWAY_TOKEN,
skipProviders: process.env.CLAWDBOT_SKIP_PROVIDERS,
skipGmail: process.env.CLAWDBOT_SKIP_GMAIL_WATCHER,
skipCron: process.env.CLAWDBOT_SKIP_CRON,
skipCanvas: process.env.CLAWDBOT_SKIP_CANVAS_HOST,
};
process.env.CLAWDBOT_SKIP_PROVIDERS = "1";
process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = "1";
process.env.CLAWDBOT_SKIP_CRON = "1";
process.env.CLAWDBOT_SKIP_CANVAS_HOST = "1";
const token = `test-${randomUUID()}`;
process.env.CLAWDBOT_GATEWAY_TOKEN = token;
const workspaceDir = resolveUserPath(
params.cfg.agents?.defaults?.workspace ?? path.join(os.homedir(), "clawd"),
);
await fs.mkdir(workspaceDir, { recursive: true });
const nonceA = randomUUID();
const nonceB = randomUUID();
const toolProbePath = path.join(
workspaceDir,
`.clawdbot-live-tool-probe.${nonceA}.txt`,
);
await fs.writeFile(toolProbePath, `nonceA=${nonceA}\nnonceB=${nonceB}\n`);
const nextCfg = buildLiveGatewayConfig({
cfg: params.cfg,
candidates: params.candidates,
providerOverrides: params.providerOverrides,
});
const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "clawdbot-live-"));
const tempConfigPath = path.join(tempDir, "clawdbot.json");
await fs.writeFile(tempConfigPath, `${JSON.stringify(nextCfg, null, 2)}\n`);
process.env.CLAWDBOT_CONFIG_PATH = tempConfigPath;
await ensureClawdbotModelsJson(nextCfg);
const port = await getFreeGatewayPort();
const server = await startGatewayServer(port, {
bind: "loopback",
auth: { mode: "token", token },
controlUiEnabled: false,
});
const client = await connectClient({
url: `ws://127.0.0.1:${port}`,
token,
});
try {
logProgress(
`[${params.label}] running ${params.candidates.length} models (thinking=${params.thinkingLevel})`,
);
const anthropicKeys = collectAnthropicApiKeys();
if (anthropicKeys.length > 0) {
process.env.ANTHROPIC_API_KEY = anthropicKeys[0];
logProgress(`[${params.label}] anthropic keys loaded: ${anthropicKeys.length}`);
}
const sessionKey = `agent:dev:${params.label}`;
const failures: Array<{ model: string; error: string }> = [];
const total = params.candidates.length;
for (const [index, model] of params.candidates.entries()) {
const modelKey = `${model.provider}/${model.id}`;
const progressLabel = `[${params.label}] ${index + 1}/${total} ${modelKey}`;
const attemptMax =
model.provider === "anthropic" && anthropicKeys.length > 0
? anthropicKeys.length
: 1;
for (let attempt = 0; attempt < attemptMax; attempt += 1) {
if (model.provider === "anthropic" && anthropicKeys.length > 0) {
process.env.ANTHROPIC_API_KEY = anthropicKeys[attempt];
}
try {
// Ensure session exists + override model for this run.
await client.request<Record<string, unknown>>("sessions.patch", {
key: sessionKey,
model: modelKey,
});
// Reset between models: avoids cross-provider transcript incompatibilities
// (notably OpenAI Responses requiring reasoning replay for function_call items).
await client.request<Record<string, unknown>>("sessions.reset", {
key: sessionKey,
});
logProgress(`${progressLabel}: prompt`);
const runId = randomUUID();
const payload = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId}`,
message:
"Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
);
if (payload?.status !== "ok") {
throw new Error(`agent status=${String(payload?.status)}`);
}
const text = extractPayloadText(payload?.result);
if (model.provider === "google" && isGoogleModelNotFoundText(text)) {
// Catalog drift: model IDs can disappear or become unavailable on the API.
// Treat as skip when scanning "all models" for Google.
logProgress(`${progressLabel}: skip (google model not found)`);
break;
}
assertNoReasoningTags({
text,
model: modelKey,
phase: "prompt",
label: params.label,
});
if (!isMeaningful(text)) throw new Error(`not meaningful: ${text}`);
if (
!/\bmicro\s*-?\s*tasks?\b/i.test(text) ||
!/\bmacro\s*-?\s*tasks?\b/i.test(text)
) {
throw new Error(`missing required keywords: ${text}`);
}
// Real tool invocation: force the agent to Read a local file and echo a nonce.
logProgress(`${progressLabel}: tool-read`);
const runIdTool = randomUUID();
const toolProbe = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runIdTool}-tool`,
message:
"Clawdbot live tool probe (local, safe): " +
`use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
"Then reply with the two nonce values you read (include both).",
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
);
if (toolProbe?.status !== "ok") {
throw new Error(`tool probe failed: status=${String(toolProbe?.status)}`);
}
const toolText = extractPayloadText(toolProbe?.result);
assertNoReasoningTags({
text: toolText,
model: modelKey,
phase: "tool-read",
label: params.label,
});
if (!toolText.includes(nonceA) || !toolText.includes(nonceB)) {
throw new Error(`tool probe missing nonce: ${toolText}`);
}
if (params.extraToolProbes) {
logProgress(`${progressLabel}: tool-exec`);
const nonceC = randomUUID();
const toolWritePath = path.join(tempDir, `write-${runIdTool}.txt`);
const execReadProbe = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runIdTool}-exec-read`,
message:
"Clawdbot live tool probe (local, safe): " +
"use the tool named `exec` (or `Exec`) to run this command: " +
`mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
`Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
"Finally reply including the nonce text you read back.",
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
);
if (execReadProbe?.status !== "ok") {
throw new Error(
`exec+read probe failed: status=${String(execReadProbe?.status)}`,
);
}
const execReadText = extractPayloadText(execReadProbe?.result);
assertNoReasoningTags({
text: execReadText,
model: modelKey,
phase: "tool-exec",
label: params.label,
});
if (!execReadText.includes(nonceC)) {
throw new Error(`exec+read probe missing nonce: ${execReadText}`);
}
await fs.rm(toolWritePath, { force: true });
}
if (params.extraImageProbes && model.input?.includes("image")) {
logProgress(`${progressLabel}: image`);
const imageCode = randomImageProbeCode(10);
const imageBase64 = renderCatNoncePngBase64(imageCode);
const runIdImage = randomUUID();
const imageProbe = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runIdImage}-image`,
message:
"Look at the attached image. Reply with exactly two tokens separated by a single space: " +
"(1) the animal shown or written in the image, lowercase; " +
"(2) the code printed in the image, uppercase. No extra text.",
attachments: [
{
mimeType: "image/png",
fileName: `probe-${runIdImage}.png`,
content: imageBase64,
},
],
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
);
if (imageProbe?.status !== "ok") {
throw new Error(
`image probe failed: status=${String(imageProbe?.status)}`,
);
}
const imageText = extractPayloadText(imageProbe?.result);
assertNoReasoningTags({
text: imageText,
model: modelKey,
phase: "image",
label: params.label,
});
if (!/\bcat\b/i.test(imageText)) {
throw new Error(`image probe missing 'cat': ${imageText}`);
}
const candidates =
imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? [];
const bestDistance = candidates.reduce((best, cand) => {
if (Math.abs(cand.length - imageCode.length) > 2) return best;
return Math.min(best, editDistance(cand, imageCode));
}, Number.POSITIVE_INFINITY);
if (!(bestDistance <= 2)) {
throw new Error(
`image probe missing code (${imageCode}): ${imageText}`,
);
}
}
// Regression: tool-call-only turn followed by a user message (OpenAI responses bug class).
if (
(model.provider === "openai" && model.api === "openai-responses") ||
(model.provider === "openai-codex" &&
model.api === "openai-codex-responses")
) {
logProgress(`${progressLabel}: tool-only regression`);
const runId2 = randomUUID();
const first = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId2}-1`,
message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`,
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
);
if (first?.status !== "ok") {
throw new Error(`tool-only turn failed: status=${String(first?.status)}`);
}
const firstText = extractPayloadText(first?.result);
assertNoReasoningTags({
text: firstText,
model: modelKey,
phase: "tool-only",
label: params.label,
});
const second = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId2}-2`,
message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`,
thinking: params.thinkingLevel,
deliver: false,
},
{ expectFinal: true },
);
if (second?.status !== "ok") {
throw new Error(
`post-tool message failed: status=${String(second?.status)}`,
);
}
const reply = extractPayloadText(second?.result);
assertNoReasoningTags({
text: reply,
model: modelKey,
phase: "tool-only-followup",
label: params.label,
});
if (!reply.includes(nonceA) || !reply.includes(nonceB)) {
throw new Error(`unexpected reply: ${reply}`);
}
}
logProgress(`${progressLabel}: done`);
break;
} catch (err) {
const message = String(err);
if (
model.provider === "anthropic" &&
isAnthropicRateLimitError(message) &&
attempt + 1 < attemptMax
) {
logProgress(`${progressLabel}: rate limit, retrying with next key`);
continue;
}
// OpenAI Codex refresh tokens can become single-use; skip instead of failing all live tests.
if (
model.provider === "openai-codex" &&
isRefreshTokenReused(message)
) {
logProgress(`${progressLabel}: skip (codex refresh token reused)`);
break;
}
logProgress(`${progressLabel}: failed`);
failures.push({ model: modelKey, error: message });
break;
}
}
}
if (failures.length > 0) {
const preview = failures
.slice(0, 20)
.map((f) => `- ${f.model}: ${f.error}`)
.join("\n");
throw new Error(
`gateway live model failures (${failures.length}):\n${preview}`,
);
}
} finally {
client.stop();
await server.close({ reason: "live test complete" });
await fs.rm(toolProbePath, { force: true });
await fs.rm(tempDir, { recursive: true, force: true });
process.env.CLAWDBOT_CONFIG_PATH = previous.configPath;
process.env.CLAWDBOT_GATEWAY_TOKEN = previous.token;
process.env.CLAWDBOT_SKIP_PROVIDERS = previous.skipProviders;
process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = previous.skipGmail;
process.env.CLAWDBOT_SKIP_CRON = previous.skipCron;
process.env.CLAWDBOT_SKIP_CANVAS_HOST = previous.skipCanvas;
}
}
describeLive("gateway live (dev agent, profile keys)", () => {
it(
"runs meaningful prompts across models with available keys",
async () => {
const previous = {
configPath: process.env.CLAWDBOT_CONFIG_PATH,
token: process.env.CLAWDBOT_GATEWAY_TOKEN,
skipProviders: process.env.CLAWDBOT_SKIP_PROVIDERS,
skipGmail: process.env.CLAWDBOT_SKIP_GMAIL_WATCHER,
skipCron: process.env.CLAWDBOT_SKIP_CRON,
skipCanvas: process.env.CLAWDBOT_SKIP_CANVAS_HOST,
};
process.env.CLAWDBOT_SKIP_PROVIDERS = "1";
process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = "1";
process.env.CLAWDBOT_SKIP_CRON = "1";
process.env.CLAWDBOT_SKIP_CANVAS_HOST = "1";
const token = `test-${randomUUID()}`;
process.env.CLAWDBOT_GATEWAY_TOKEN = token;
const cfg = loadConfig();
await ensureClawdbotModelsJson(cfg);
const workspaceDir = resolveUserPath(
cfg.agents?.defaults?.workspace ?? path.join(os.homedir(), "clawd"),
);
await fs.mkdir(workspaceDir, { recursive: true });
const nonceA = randomUUID();
const nonceB = randomUUID();
const toolProbePath = path.join(
workspaceDir,
`.clawdbot-live-tool-probe.${nonceA}.txt`,
);
await fs.writeFile(toolProbePath, `nonceA=${nonceA}\nnonceB=${nonceB}\n`);
const agentDir = resolveClawdbotAgentDir();
const authStorage = discoverAuthStorage(agentDir);
const modelRegistry = discoverModels(authStorage, agentDir);
const all = modelRegistry.getAll() as Array<Model<Api>>;
const filter = parseFilter(process.env.CLAWDBOT_LIVE_GATEWAY_MODELS);
// Default: honor user allowlist. Opt-in: scan all models with keys.
const allowlistKeys = Object.keys(cfg.agents?.defaults?.models ?? {});
const wanted =
ALL_MODELS || allowlistKeys.length === 0
? all
: all.filter((m) => allowlistKeys.includes(`${m.provider}/${m.id}`));
const rawModels = process.env.CLAWDBOT_LIVE_GATEWAY_MODELS?.trim();
const useModern =
!rawModels || rawModels === "modern" || rawModels === "all";
const useExplicit = Boolean(rawModels) && !useModern;
const filter = useExplicit ? parseFilter(rawModels) : null;
const wanted = filter
? all.filter((m) => filter.has(`${m.provider}/${m.id}`))
: all.filter((m) =>
isModernModelRef({ provider: m.provider, id: m.id }),
);
const candidates: Array<Model<Api>> = [];
for (const model of wanted) {
const id = `${model.provider}/${model.id}`;
if (PROVIDERS && !PROVIDERS.has(model.provider)) continue;
if (filter && !filter.has(id)) continue;
try {
// eslint-disable-next-line no-await-in-loop
await getApiKeyForModel({ model, cfg });
@@ -264,315 +697,72 @@ describeLive("gateway live (dev agent, profile keys)", () => {
}
}
expect(candidates.length).toBeGreaterThan(0);
const imageCandidates = EXTRA_IMAGE_PROBES
? candidates.filter((m) => m.input?.includes("image"))
: [];
if (EXTRA_IMAGE_PROBES && imageCandidates.length === 0) {
throw new Error(
"image probe enabled but no selected models advertise image support; set CLAWDBOT_LIVE_GATEWAY_MODELS to include an image-capable model",
if (candidates.length === 0) {
logProgress("[all-models] no API keys found; skipping");
return;
}
logProgress(
`[all-models] selection=${useExplicit ? "explicit" : "modern"}`,
);
const imageCandidates = candidates.filter((m) =>
m.input?.includes("image"),
);
if (imageCandidates.length === 0) {
logProgress(
"[all-models] no image-capable models selected; image probe will be skipped",
);
}
// Build a temp config that allows all selected models, so session overrides stick.
const lmstudioProvider = cfg.models?.providers?.lmstudio;
const nextCfg = {
...cfg,
agents: {
...cfg.agents,
list: (cfg.agents?.list ?? []).map((entry) => ({
...entry,
sandbox: { mode: "off" },
})),
defaults: {
...cfg.agents?.defaults,
// Live tests should avoid Docker sandboxing so tool probes can
// operate on the temporary probe files we create in the host workspace.
sandbox: { mode: "off" },
models: Object.fromEntries(
candidates.map((m) => [`${m.provider}/${m.id}`, {}]),
),
},
},
models: {
...cfg.models,
providers: {
...cfg.models?.providers,
// LM Studio is most reliable via Chat Completions; its Responses API
// tool-calling behavior is inconsistent across releases.
...(lmstudioProvider
? {
lmstudio: {
...lmstudioProvider,
api: "openai-completions",
},
}
: {}),
},
},
};
const tempDir = await fs.mkdtemp(
path.join(os.tmpdir(), "clawdbot-live-"),
);
const tempConfigPath = path.join(tempDir, "clawdbot.json");
await fs.writeFile(
tempConfigPath,
`${JSON.stringify(nextCfg, null, 2)}\n`,
);
process.env.CLAWDBOT_CONFIG_PATH = tempConfigPath;
const port = await getFreeGatewayPort();
const server = await startGatewayServer(port, {
bind: "loopback",
auth: { mode: "token", token },
controlUiEnabled: false,
await runGatewayModelSuite({
label: "all-models",
cfg,
candidates,
extraToolProbes: true,
extraImageProbes: true,
thinkingLevel: THINKING_LEVEL,
});
const client = await connectClient({
url: `ws://127.0.0.1:${port}`,
token,
const minimaxCandidates = candidates.filter((model) => model.provider === "minimax");
if (minimaxCandidates.length === 0) {
logProgress("[minimax] no candidates with keys; skipping dual endpoint probes");
return;
}
const minimaxOpenAi = buildMinimaxProviderOverride({
cfg,
api: "openai-completions",
baseUrl: "https://api.minimax.io/v1",
});
if (minimaxOpenAi) {
await runGatewayModelSuite({
label: "minimax-openai",
cfg,
candidates: minimaxCandidates,
extraToolProbes: true,
extraImageProbes: true,
thinkingLevel: THINKING_LEVEL,
providerOverrides: { minimax: minimaxOpenAi },
});
} else {
logProgress("[minimax-openai] missing minimax provider config; skipping");
}
try {
const sessionKey = "agent:dev:live-gateway";
const failures: Array<{ model: string; error: string }> = [];
for (const model of candidates) {
const modelKey = `${model.provider}/${model.id}`;
try {
// Ensure session exists + override model for this run.
await client.request<Record<string, unknown>>("sessions.patch", {
key: sessionKey,
model: modelKey,
});
// Reset between models: avoids cross-provider transcript incompatibilities
// (notably OpenAI Responses requiring reasoning replay for function_call items).
await client.request<Record<string, unknown>>("sessions.reset", {
key: sessionKey,
});
// “Meaningful” direct prompt (no tools).
const runId = randomUUID();
const payload = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId}`,
message:
"Explain in 2-3 sentences how the JavaScript event loop handles microtasks vs macrotasks. Must mention both words: microtask and macrotask.",
deliver: false,
},
{ expectFinal: true },
);
if (payload?.status !== "ok") {
throw new Error(`agent status=${String(payload?.status)}`);
}
const text = extractPayloadText(payload?.result);
if (
model.provider === "google" &&
isGoogleModelNotFoundText(text)
) {
// Catalog drift: model IDs can disappear or become unavailable on the API.
// Treat as skip when scanning "all models" for Google.
continue;
}
if (!isMeaningful(text)) throw new Error(`not meaningful: ${text}`);
if (
!/\bmicro\s*-?\s*tasks?\b/i.test(text) ||
!/\bmacro\s*-?\s*tasks?\b/i.test(text)
) {
throw new Error(`missing required keywords: ${text}`);
}
// Real tool invocation: force the agent to Read a local file and echo a nonce.
const runIdTool = randomUUID();
const toolProbe = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runIdTool}-tool`,
message:
"Clawdbot live tool probe (local, safe): " +
`use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolProbePath}"}. ` +
"Then reply with the two nonce values you read (include both).",
deliver: false,
},
{ expectFinal: true },
);
if (toolProbe?.status !== "ok") {
throw new Error(
`tool probe failed: status=${String(toolProbe?.status)}`,
);
}
const toolText = extractPayloadText(toolProbe?.result);
if (!toolText.includes(nonceA) || !toolText.includes(nonceB)) {
throw new Error(`tool probe missing nonce: ${toolText}`);
}
if (EXTRA_TOOL_PROBES) {
const nonceC = randomUUID();
const toolWritePath = path.join(
tempDir,
`write-${runIdTool}.txt`,
);
const execReadProbe = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runIdTool}-exec-read`,
message:
"Clawdbot live tool probe (local, safe): " +
"use the tool named `exec` (or `Exec`) to run this command: " +
`mkdir -p "${tempDir}" && printf '%s' '${nonceC}' > "${toolWritePath}". ` +
`Then use the tool named \`read\` (or \`Read\`) with JSON arguments {"path":"${toolWritePath}"}. ` +
"Finally reply including the nonce text you read back.",
deliver: false,
},
{ expectFinal: true },
);
if (execReadProbe?.status !== "ok") {
throw new Error(
`exec+read probe failed: status=${String(execReadProbe?.status)}`,
);
}
const execReadText = extractPayloadText(execReadProbe?.result);
if (!execReadText.includes(nonceC)) {
throw new Error(
`exec+read probe missing nonce: ${execReadText}`,
);
}
await fs.rm(toolWritePath, { force: true });
}
if (EXTRA_IMAGE_PROBES && model.input?.includes("image")) {
const imageCode = randomImageProbeCode(10);
const imageBase64 = renderCatNoncePngBase64(imageCode);
const runIdImage = randomUUID();
const imageProbe = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runIdImage}-image`,
message:
"Look at the attached image. Reply with exactly two tokens separated by a single space: " +
"(1) the animal shown or written in the image, lowercase; " +
"(2) the code printed in the image, uppercase. No extra text.",
attachments: [
{
mimeType: "image/png",
fileName: `probe-${runIdImage}.png`,
content: imageBase64,
},
],
deliver: false,
},
{ expectFinal: true },
);
if (imageProbe?.status !== "ok") {
throw new Error(
`image probe failed: status=${String(imageProbe?.status)}`,
);
}
const imageText = extractPayloadText(imageProbe?.result);
if (!/\bcat\b/i.test(imageText)) {
throw new Error(`image probe missing 'cat': ${imageText}`);
}
const candidates =
imageText.toUpperCase().match(/[A-Z0-9]{6,20}/g) ?? [];
const bestDistance = candidates.reduce((best, cand) => {
if (Math.abs(cand.length - imageCode.length) > 2) return best;
return Math.min(best, editDistance(cand, imageCode));
}, Number.POSITIVE_INFINITY);
if (!(bestDistance <= 2)) {
throw new Error(
`image probe missing code (${imageCode}): ${imageText}`,
);
}
}
// Regression: tool-call-only turn followed by a user message (OpenAI responses bug class).
if (
(model.provider === "openai" &&
model.api === "openai-responses") ||
(model.provider === "openai-codex" &&
model.api === "openai-codex-responses")
) {
const runId2 = randomUUID();
const first = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId2}-1`,
message: `Call the tool named \`read\` (or \`Read\`) on "${toolProbePath}". Do not write any other text.`,
deliver: false,
},
{ expectFinal: true },
);
if (first?.status !== "ok") {
throw new Error(
`tool-only turn failed: status=${String(first?.status)}`,
);
}
const second = await client.request<AgentFinalPayload>(
"agent",
{
sessionKey,
idempotencyKey: `idem-${runId2}-2`,
message: `Now answer: what are the values of nonceA and nonceB in "${toolProbePath}"? Reply with exactly: ${nonceA} ${nonceB}.`,
deliver: false,
},
{ expectFinal: true },
);
if (second?.status !== "ok") {
throw new Error(
`post-tool message failed: status=${String(second?.status)}`,
);
}
const reply = extractPayloadText(second?.result);
if (!reply.includes(nonceA) || !reply.includes(nonceB)) {
throw new Error(`unexpected reply: ${reply}`);
}
}
} catch (err) {
const message = String(err);
// OpenAI Codex refresh tokens can become single-use; skip instead of failing all live tests.
if (
model.provider === "openai-codex" &&
isRefreshTokenReused(message)
) {
continue;
}
failures.push({ model: modelKey, error: message });
}
}
if (failures.length > 0) {
const preview = failures
.slice(0, 20)
.map((f) => `- ${f.model}: ${f.error}`)
.join("\n");
throw new Error(
`gateway live model failures (${failures.length}):\n${preview}`,
);
}
} finally {
client.stop();
await server.close({ reason: "live test complete" });
await fs.rm(toolProbePath, { force: true });
await fs.rm(tempDir, { recursive: true, force: true });
process.env.CLAWDBOT_CONFIG_PATH = previous.configPath;
process.env.CLAWDBOT_GATEWAY_TOKEN = previous.token;
process.env.CLAWDBOT_SKIP_PROVIDERS = previous.skipProviders;
process.env.CLAWDBOT_SKIP_GMAIL_WATCHER = previous.skipGmail;
process.env.CLAWDBOT_SKIP_CRON = previous.skipCron;
process.env.CLAWDBOT_SKIP_CANVAS_HOST = previous.skipCanvas;
const minimaxAnthropic = buildMinimaxProviderOverride({
cfg,
api: "anthropic-messages",
baseUrl: "https://api.minimax.io/anthropic",
});
if (minimaxAnthropic) {
await runGatewayModelSuite({
label: "minimax-anthropic",
cfg,
candidates: minimaxCandidates,
extraToolProbes: true,
extraImageProbes: true,
thinkingLevel: THINKING_LEVEL,
providerOverrides: { minimax: minimaxAnthropic },
});
} else {
logProgress("[minimax-anthropic] missing minimax provider config; skipping");
}
},
20 * 60 * 1000,
@@ -661,6 +851,7 @@ describeLive("gateway live (dev agent, profile keys)", () => {
message:
`Call the tool named \`read\` (or \`Read\` if \`read\` is unavailable) with JSON arguments {"path":"${toolProbePath}"}. ` +
`Then reply with exactly: ${nonceA} ${nonceB}. No extra text.`,
thinking: THINKING_LEVEL,
deliver: false,
},
{ expectFinal: true },
@@ -671,6 +862,12 @@ describeLive("gateway live (dev agent, profile keys)", () => {
);
}
const toolText = extractPayloadText(toolProbe?.result);
assertNoReasoningTags({
text: toolText,
model: "anthropic/claude-opus-4-5",
phase: "zai-fallback-tool",
label: "zai-fallback",
});
if (!toolText.includes(nonceA) || !toolText.includes(nonceB)) {
throw new Error(`anthropic tool probe missing nonce: ${toolText}`);
}
@@ -689,6 +886,7 @@ describeLive("gateway live (dev agent, profile keys)", () => {
message:
`What are the values of nonceA and nonceB in "${toolProbePath}"? ` +
`Reply with exactly: ${nonceA} ${nonceB}.`,
thinking: THINKING_LEVEL,
deliver: false,
},
{ expectFinal: true },
@@ -699,6 +897,12 @@ describeLive("gateway live (dev agent, profile keys)", () => {
);
}
const followupText = extractPayloadText(followup?.result);
assertNoReasoningTags({
text: followupText,
model: "zai/glm-4.7",
phase: "zai-fallback-followup",
label: "zai-fallback",
});
if (!followupText.includes(nonceA) || !followupText.includes(nonceB)) {
throw new Error(`zai followup missing nonce: ${followupText}`);
}