fix(testing): stabilize live model runs

This commit is contained in:
Peter Steinberger
2026-01-11 04:21:24 +00:00
parent 3b6739d3e9
commit f00038b383
4 changed files with 169 additions and 47 deletions

View File

@@ -27,19 +27,3 @@ index 188a8294f26fe1bfe3fb298a7f58e4d8eaf2a529..a3aeb6a7ff53bc4f7f44362adb950b2c
}));
}
function mapStopReason(status) {
diff --git a/dist/providers/openai-responses.js b/dist/providers/openai-responses.js
index 7b58a79c989abc76bb8fc9e99fb49126e5fd7de4..a1a7f35ad47975dc1268d1a0c2078b0b651e97b4 100644
--- a/dist/providers/openai-responses.js
+++ b/dist/providers/openai-responses.js
@@ -396,9 +396,10 @@ function convertMessages(model, context) {
}
else if (msg.role === "assistant") {
const output = [];
+ const hasAssistantText = msg.content.some((block) => block.type === "text");
for (const block of msg.content) {
// Do not submit thinking blocks if the completion had an error (i.e. abort)
- if (block.type === "thinking" && msg.stopReason !== "error") {
+ if (block.type === "thinking" && msg.stopReason !== "error" && hasAssistantText) {
if (block.thinkingSignature) {
const reasoningItem = JSON.parse(block.thinkingSignature);
output.push(reasoningItem);

View File

@@ -7,7 +7,14 @@ import { Type } from "@sinclair/typebox";
import { describe, expect, it } from "vitest";
import { loadConfig } from "../config/config.js";
import { resolveClawdbotAgentDir } from "./agent-paths.js";
import { DEFAULT_MODEL, DEFAULT_PROVIDER } from "./defaults.js";
import { getApiKeyForModel } from "./model-auth.js";
import {
buildModelAliasIndex,
parseModelRef,
resolveConfiguredModelRef,
resolveModelRefFromString,
} from "./model-selection.js";
import { ensureClawdbotModelsJson } from "./models-config.js";
const LIVE = process.env.LIVE === "1" || process.env.CLAWDBOT_LIVE_TEST === "1";
@@ -58,6 +65,131 @@ function isModelNotFoundErrorMessage(raw: string): boolean {
return false;
}
/**
 * Parses a base-10 integer out of an optional string (for example an
 * environment variable), returning `fallback` when nothing usable is there.
 *
 * @param value - Raw text; `undefined`, empty, and whitespace-only all yield `fallback`.
 * @param fallback - Value returned when `value` is absent or does not parse to a finite number.
 * @returns The parsed integer, or `fallback`.
 */
function toInt(value: string | undefined, fallback: number): number {
  const text = (value ?? "").trim();
  if (text.length === 0) {
    return fallback;
  }
  // parseInt reads a leading numeric prefix ("12px" -> 12) and can overflow to
  // Infinity on very long digit runs, hence the finiteness check, not just NaN.
  const candidate = Number.parseInt(text, 10);
  if (!Number.isFinite(candidate)) {
    return fallback;
  }
  return candidate;
}
/**
 * Calls `completeSimple` with an abort signal that fires after `timeoutMs`.
 *
 * The signal is placed after the spread of `options`, so any `signal` the
 * caller supplied there is replaced by the timeout signal.
 * NOTE(review): whether the request actually stops on abort depends on
 * `completeSimple` honoring the signal — not visible from here.
 *
 * @param model - Model to run the completion against.
 * @param context - Forwarded unchanged as the second argument of `completeSimple`.
 * @param options - Forwarded as the third argument, with `signal` overridden.
 * @param timeoutMs - Delay before aborting; values below 1 are raised to 1 ms.
 * @returns Whatever `completeSimple` resolves to.
 */
async function completeSimpleWithTimeout<TApi extends Api>(
  model: Model<TApi>,
  context: Parameters<typeof completeSimple<TApi>>[1],
  options: Parameters<typeof completeSimple<TApi>>[2],
  timeoutMs: number,
) {
  const abortSource = new AbortController();
  const delayMs = Math.max(1, timeoutMs);
  const handle = setTimeout(() => {
    abortSource.abort();
  }, delayMs);
  // In Node, unref() stops a pending timer from keeping the process alive;
  // the optional call tolerates runtimes whose timer handle lacks it.
  handle.unref?.();
  try {
    // Await inside the try so the timer is only cleared once the call settles.
    const result = await completeSimple(model, context, {
      ...options,
      signal: abortSource.signal,
    });
    return result;
  } finally {
    clearTimeout(handle);
  }
}
/**
 * Sends a fixed "Reply with the word ok." prompt and retries exactly once
 * when the first response contains no text.
 *
 * The retry is keyed only on empty text: a first response with text is
 * returned as-is, and the second response is returned unconditionally (even
 * if it is also empty), leaving any further checks to the caller.
 *
 * @param params.model - Model to query; `reasoning` is set to "low" when the model is flagged as reasoning-capable.
 * @param params.apiKey - API key forwarded to the completion call.
 * @param params.timeoutMs - Per-attempt timeout passed to `completeSimpleWithTimeout`.
 * @returns The raw response (`res`) plus its text blocks, each trimmed and joined with a single space (`text`).
 */
async function completeOkWithRetry(params: {
  model: Model<Api>;
  apiKey: string;
  timeoutMs: number;
}) {
  const { model, apiKey, timeoutMs } = params;

  const attempt = async () => {
    const prompt = {
      messages: [
        {
          role: "user",
          content: "Reply with the word ok.",
          // Evaluated per attempt, so a retry carries a fresh timestamp.
          timestamp: Date.now(),
        },
      ],
    };
    const callOptions = {
      apiKey,
      reasoning: model.reasoning ? "low" : undefined,
      maxTokens: 64,
    };
    const res = await completeSimpleWithTimeout(
      model,
      prompt,
      callOptions,
      timeoutMs,
    );

    const pieces: string[] = [];
    for (const block of res.content) {
      if (block.type === "text") {
        pieces.push(block.text.trim());
      }
    }
    return { res, text: pieces.join(" ") };
  };

  const initial = await attempt();
  return initial.text.length > 0 ? initial : await attempt();
}
/**
 * Collects the `provider/model` keys referenced by the config, de-duplicated
 * and in first-seen order.
 *
 * Sources are visited in this order:
 *  1. the ref returned by `resolveConfiguredModelRef`,
 *  2. `agents.defaults.model` — primary, then fallbacks,
 *  3. `agents.defaults.imageModel` — primary, then fallbacks,
 *  4. the keys of `agents.defaults.models`.
 *
 * Raw strings from (2) and (3) go through `resolveModelRefFromString` with the
 * alias index; keys from (4) go through `parseModelRef`. Entries that either
 * helper returns a falsy value for are skipped.
 *
 * @param cfg - Loaded config object.
 * @returns Ordered, unique `provider/model` keys.
 */
function resolveConfiguredModelKeys(
  cfg: ReturnType<typeof loadConfig>,
): string[] {
  type ModelSlot = { primary?: string; fallbacks?: string[] } | undefined;

  const aliasIndex = buildModelAliasIndex({
    cfg,
    defaultProvider: DEFAULT_PROVIDER,
  });

  // A Set iterates in insertion order and ignores repeats, so it serves as
  // both the "seen" filter and the ordered result.
  const keys = new Set<string>();

  const recordRef = (ref: { provider: string; model: string }) => {
    const key = `${ref.provider}/${ref.model}`.trim();
    if (key) keys.add(key);
  };

  const recordRaw = (raw: string) => {
    const hit = resolveModelRefFromString({
      raw,
      defaultProvider: DEFAULT_PROVIDER,
      aliasIndex,
    });
    if (hit) recordRef(hit.ref);
  };

  // The resolved default always comes first.
  recordRef(
    resolveConfiguredModelRef({
      cfg,
      defaultProvider: DEFAULT_PROVIDER,
      defaultModel: DEFAULT_MODEL,
    }),
  );

  const defaults = cfg.agents?.defaults;
  const slots = [
    defaults?.model as ModelSlot,
    defaults?.imageModel as ModelSlot,
  ].map((slot) => ({
    primary: slot?.primary?.trim() ?? "",
    fallbacks: slot?.fallbacks ?? [],
  }));

  for (const slot of slots) {
    // Only the primary is trimmed and emptiness-checked; fallbacks are passed
    // through as strings and left for the resolver to accept or reject.
    if (slot.primary) recordRaw(slot.primary);
    for (const entry of slot.fallbacks) {
      recordRaw(String(entry ?? ""));
    }
  }

  for (const name of Object.keys(defaults?.models ?? {})) {
    const ref = parseModelRef(String(name ?? ""), DEFAULT_PROVIDER);
    if (ref) recordRef(ref);
  }

  return [...keys];
}
describeLive("live models (profile keys)", () => {
it(
"completes across configured models",
@@ -69,16 +201,33 @@ describeLive("live models (profile keys)", () => {
const authStorage = discoverAuthStorage(agentDir);
const modelRegistry = discoverModels(authStorage, agentDir);
const models = modelRegistry.getAll() as Array<Model<Api>>;
const modelByKey = new Map(
models.map((model) => [`${model.provider}/${model.id}`, model]),
);
const filter = parseModelFilter(process.env.CLAWDBOT_LIVE_MODELS);
const providers = parseProviderFilter(
process.env.CLAWDBOT_LIVE_PROVIDERS,
);
const perModelTimeoutMs = toInt(
process.env.CLAWDBOT_LIVE_MODEL_TIMEOUT_MS,
30_000,
);
const failures: Array<{ model: string; error: string }> = [];
const skipped: Array<{ model: string; reason: string }> = [];
for (const model of models) {
const configuredKeys = resolveConfiguredModelKeys(cfg);
for (const key of configuredKeys) {
const model = modelByKey.get(key);
if (!model) {
skipped.push({
model: key,
reason: "configured model missing in registry",
});
continue;
}
if (providers && !providers.has(model.provider)) continue;
const id = `${model.provider}/${model.id}`;
if (filter && !filter.has(id)) continue;
@@ -100,7 +249,7 @@ describeLive("live models (profile keys)", () => {
}
try {
// Special regression: OpenAI rejects replayed `reasoning` items for tool-only turns.
// Special regression: OpenAI requires replayed `reasoning` items for tool-only turns.
if (
model.provider === "openai" &&
model.api === "openai-responses" &&
@@ -112,7 +261,7 @@ describeLive("live models (profile keys)", () => {
parameters: Type.Object({}, { additionalProperties: false }),
};
const first = await completeSimple(
const first = await completeSimpleWithTimeout(
model,
{
messages: [
@@ -130,6 +279,7 @@ describeLive("live models (profile keys)", () => {
reasoning: model.reasoning ? "low" : undefined,
maxTokens: 128,
},
perModelTimeoutMs,
);
const toolCall = first.content.find((b) => b.type === "toolCall");
@@ -138,7 +288,7 @@ describeLive("live models (profile keys)", () => {
throw new Error("expected tool call");
}
const second = await completeSimple(
const second = await completeSimpleWithTimeout(
model,
{
messages: [
@@ -169,6 +319,7 @@ describeLive("live models (profile keys)", () => {
reasoning: model.reasoning ? "low" : undefined,
maxTokens: 64,
},
perModelTimeoutMs,
);
const secondText = second.content
@@ -179,26 +330,14 @@ describeLive("live models (profile keys)", () => {
continue;
}
const res = await completeSimple(
const ok = await completeOkWithRetry({
model,
{
messages: [
{
role: "user",
content: "Reply with the word ok.",
timestamp: Date.now(),
},
],
},
{
apiKey: apiKeyInfo.apiKey,
reasoning: model.reasoning ? "low" : undefined,
maxTokens: 64,
},
);
apiKey: apiKeyInfo.apiKey,
timeoutMs: perModelTimeoutMs,
});
if (res.stopReason === "error") {
const msg = res.errorMessage ?? "";
if (ok.res.stopReason === "error") {
const msg = ok.res.errorMessage ?? "";
if (ALL_MODELS && isModelNotFoundErrorMessage(msg)) {
skipped.push({ model: id, reason: msg });
continue;
@@ -206,18 +345,14 @@ describeLive("live models (profile keys)", () => {
throw new Error(msg || "model returned error with no message");
}
const text = res.content
.filter((block) => block.type === "text")
.map((block) => block.text.trim())
.join(" ");
if (text.length === 0 && model.provider === "google") {
if (ok.text.length === 0 && model.provider === "google") {
skipped.push({
model: id,
reason: "no text returned (likely unavailable model id)",
});
continue;
}
expect(text.length).toBeGreaterThan(0);
expect(ok.text.length).toBeGreaterThan(0);
} catch (err) {
if (model.provider === "google" && isGoogleModelNotFoundError(err)) {
skipped.push({ model: id, reason: String(err) });

View File

@@ -52,7 +52,7 @@ function installFailingFetchCapture() {
}
describe("openai-responses reasoning replay", () => {
it("skips reasoning for tool-call-only turns (OpenAI rejects standalone reasoning)", async () => {
it("replays reasoning for tool-call-only turns (OpenAI requires it)", async () => {
const cap = installFailingFetchCapture();
try {
const model = buildModel();
@@ -141,8 +141,11 @@ describe("openai-responses reasoning replay", () => {
)
.filter((t): t is string => typeof t === "string");
expect(types).toContain("reasoning");
expect(types).toContain("function_call");
expect(types).not.toContain("reasoning");
expect(types.indexOf("reasoning")).toBeLessThan(
types.indexOf("function_call"),
);
} finally {
cap.restore();
}

View File

@@ -290,7 +290,7 @@ export function noteSandboxScopeWarnings(cfg: ClawdbotConfig) {
warnings.push(
`- agents.list (id "${agentId}") sandbox ${overrides.join(
"/",
)} overrides ignored (scope resolves to "shared").`,
)} overrides ignored\n scope resolves to "shared".`,
);
}