test: stabilize gpt-5.2 tool-only live check

This commit is contained in:
Peter Steinberger
2026-01-13 07:51:24 +00:00
parent e48d452c63
commit 2887376646

View File

@@ -253,19 +253,17 @@ describeLive("live models (profile keys)", () => {
parameters: Type.Object({}, { additionalProperties: false }), parameters: Type.Object({}, { additionalProperties: false }),
}; };
const first = await completeSimpleWithTimeout( let firstUserContent =
"Call the tool `noop` with {}. Do not write any other text.";
let firstUser = {
role: "user" as const,
content: firstUserContent,
timestamp: Date.now(),
};
let first = await completeSimpleWithTimeout(
model, model,
{ { messages: [firstUser], tools: [noopTool] },
messages: [
{
role: "user",
content:
"Call the tool `noop` with {}. Do not write any other text.",
timestamp: Date.now(),
},
],
tools: [noopTool],
},
{ {
apiKey, apiKey,
reasoning: resolveTestReasoning(model), reasoning: resolveTestReasoning(model),
@@ -274,8 +272,45 @@ describeLive("live models (profile keys)", () => {
perModelTimeoutMs, perModelTimeoutMs,
); );
const toolCall = first.content.find((b) => b.type === "toolCall"); let toolCall = first.content.find((b) => b.type === "toolCall");
let firstText = first.content
.filter((b) => b.type === "text")
.map((b) => b.text.trim())
.join(" ")
.trim();
// Occasional flake: the model answers in text instead of making a tool call (or adds text alongside the call).
// Retry a couple of times with a stronger instruction so we still exercise the tool-only replay path.
for (let i = 0; i < 2 && (!toolCall || firstText.length > 0); i += 1) {
firstUserContent =
"Call the tool `noop` with {}. IMPORTANT: respond ONLY with the tool call; no other text.";
firstUser = {
role: "user" as const,
content: firstUserContent,
timestamp: Date.now(),
};
first = await completeSimpleWithTimeout(
model,
{ messages: [firstUser], tools: [noopTool] },
{
apiKey,
reasoning: resolveTestReasoning(model),
maxTokens: 128,
},
perModelTimeoutMs,
);
toolCall = first.content.find((b) => b.type === "toolCall");
firstText = first.content
.filter((b) => b.type === "text")
.map((b) => b.text.trim())
.join(" ")
.trim();
}
expect(toolCall).toBeTruthy(); expect(toolCall).toBeTruthy();
expect(firstText.length).toBe(0);
if (!toolCall || toolCall.type !== "toolCall") { if (!toolCall || toolCall.type !== "toolCall") {
throw new Error("expected tool call"); throw new Error("expected tool call");
} }
@@ -284,12 +319,7 @@ describeLive("live models (profile keys)", () => {
model, model,
{ {
messages: [ messages: [
{ firstUser,
role: "user",
content:
"Call the tool `noop` with {}. Do not write any other text.",
timestamp: Date.now(),
},
first, first,
{ {
role: "toolResult", role: "toolResult",
@@ -305,14 +335,15 @@ describeLive("live models (profile keys)", () => {
timestamp: Date.now(), timestamp: Date.now(),
}, },
], ],
}, },
{ {
apiKey, apiKey,
reasoning: resolveTestReasoning(model), reasoning: resolveTestReasoning(model),
maxTokens: 64, // Headroom: reasoning summary can consume most of the output budget.
}, maxTokens: 256,
perModelTimeoutMs, },
); perModelTimeoutMs,
);
const secondText = second.content const secondText = second.content
.filter((b) => b.type === "text") .filter((b) => b.type === "text")