From 2887376646157d2baac897f4b5fba00b7d25d245 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 13 Jan 2026 07:51:24 +0000 Subject: [PATCH] test: stabilize gpt-5.2 tool-only live check --- src/agents/models.profiles.live.test.ts | 85 +++++++++++++++++-------- 1 file changed, 58 insertions(+), 27 deletions(-) diff --git a/src/agents/models.profiles.live.test.ts b/src/agents/models.profiles.live.test.ts index e991b0818..904a4bc0c 100644 --- a/src/agents/models.profiles.live.test.ts +++ b/src/agents/models.profiles.live.test.ts @@ -253,19 +253,17 @@ describeLive("live models (profile keys)", () => { parameters: Type.Object({}, { additionalProperties: false }), }; - const first = await completeSimpleWithTimeout( + let firstUserContent = + "Call the tool `noop` with {}. Do not write any other text."; + let firstUser = { + role: "user" as const, + content: firstUserContent, + timestamp: Date.now(), + }; + + let first = await completeSimpleWithTimeout( model, - { - messages: [ - { - role: "user", - content: - "Call the tool `noop` with {}. Do not write any other text.", - timestamp: Date.now(), - }, - ], - tools: [noopTool], - }, + { messages: [firstUser], tools: [noopTool] }, { apiKey, reasoning: resolveTestReasoning(model), @@ -274,8 +272,45 @@ describeLive("live models (profile keys)", () => { perModelTimeoutMs, ); - const toolCall = first.content.find((b) => b.type === "toolCall"); + let toolCall = first.content.find((b) => b.type === "toolCall"); + let firstText = first.content + .filter((b) => b.type === "text") + .map((b) => b.text.trim()) + .join(" ") + .trim(); + + // Occasional flake: model answers in text instead of tool call (or adds text). + // Retry a couple times with a stronger instruction so we still exercise the tool-only replay path. + for (let i = 0; i < 2 && (!toolCall || firstText.length > 0); i += 1) { + firstUserContent = + "Call the tool `noop` with {}. IMPORTANT: respond ONLY with the tool call; no other text."; + firstUser = { + role: "user" as const, + content: firstUserContent, + timestamp: Date.now(), + }; + + first = await completeSimpleWithTimeout( + model, + { messages: [firstUser], tools: [noopTool] }, + { + apiKey, + reasoning: resolveTestReasoning(model), + maxTokens: 128, + }, + perModelTimeoutMs, + ); + + toolCall = first.content.find((b) => b.type === "toolCall"); + firstText = first.content + .filter((b) => b.type === "text") + .map((b) => b.text.trim()) + .join(" ") + .trim(); + } + expect(toolCall).toBeTruthy(); + expect(firstText.length).toBe(0); if (!toolCall || toolCall.type !== "toolCall") { throw new Error("expected tool call"); } @@ -284,12 +319,7 @@ describeLive("live models (profile keys)", () => { model, { messages: [ - { - role: "user", - content: - "Call the tool `noop` with {}. Do not write any other text.", - timestamp: Date.now(), - }, + firstUser, first, { role: "toolResult", @@ -305,14 +335,15 @@ describeLive("live models (profile keys)", () => { timestamp: Date.now(), }, ], - }, - { - apiKey, - reasoning: resolveTestReasoning(model), - maxTokens: 64, - }, - perModelTimeoutMs, - ); + }, + { + apiKey, + reasoning: resolveTestReasoning(model), + // Headroom: reasoning summary can consume most of the output budget. + maxTokens: 256, + }, + perModelTimeoutMs, + ); const secondText = second.content .filter((b) => b.type === "text")