From 2887376646157d2baac897f4b5fba00b7d25d245 Mon Sep 17 00:00:00 2001
From: Peter Steinberger <steipete@gmail.com>
Date: Tue, 13 Jan 2026 07:51:24 +0000
Subject: [PATCH] test: stabilize gpt-5.2 tool-only live check

---
 src/agents/models.profiles.live.test.ts | 85 +++++++++++++++++--------
 1 file changed, 58 insertions(+), 27 deletions(-)

diff --git a/src/agents/models.profiles.live.test.ts b/src/agents/models.profiles.live.test.ts
index e991b0818..904a4bc0c 100644
--- a/src/agents/models.profiles.live.test.ts
+++ b/src/agents/models.profiles.live.test.ts
@@ -253,19 +253,17 @@ describeLive("live models (profile keys)", () => {
                 parameters: Type.Object({}, { additionalProperties: false }),
               };
 
-              const first = await completeSimpleWithTimeout(
+              let firstUserContent =
+                "Call the tool `noop` with {}. Do not write any other text.";
+              let firstUser = {
+                role: "user" as const,
+                content: firstUserContent,
+                timestamp: Date.now(),
+              };
+
+              let first = await completeSimpleWithTimeout(
                 model,
-                {
-                  messages: [
-                    {
-                      role: "user",
-                      content:
-                        "Call the tool `noop` with {}. Do not write any other text.",
-                      timestamp: Date.now(),
-                    },
-                  ],
-                  tools: [noopTool],
-                },
+                { messages: [firstUser], tools: [noopTool] },
                 {
                   apiKey,
                   reasoning: resolveTestReasoning(model),
@@ -274,8 +272,45 @@ describeLive("live models (profile keys)", () => {
                 perModelTimeoutMs,
               );
 
-              const toolCall = first.content.find((b) => b.type === "toolCall");
+              let toolCall = first.content.find((b) => b.type === "toolCall");
+              let firstText = first.content
+                .filter((b) => b.type === "text")
+                .map((b) => b.text.trim())
+                .join(" ")
+                .trim();
+
+              // Occasional flake: the model answers in text instead of making a tool call (or adds text alongside it).
+              // Retry up to two times with a stronger instruction so we still exercise the tool-only replay path.
+              for (let i = 0; i < 2 && (!toolCall || firstText.length > 0); i += 1) {
+                firstUserContent =
+                  "Call the tool `noop` with {}. IMPORTANT: respond ONLY with the tool call; no other text.";
+                firstUser = {
+                  role: "user" as const,
+                  content: firstUserContent,
+                  timestamp: Date.now(),
+                };
+
+                first = await completeSimpleWithTimeout(
+                  model,
+                  { messages: [firstUser], tools: [noopTool] },
+                  {
+                    apiKey,
+                    reasoning: resolveTestReasoning(model),
+                    maxTokens: 128,
+                  },
+                  perModelTimeoutMs,
+                );
+
+                toolCall = first.content.find((b) => b.type === "toolCall");
+                firstText = first.content
+                  .filter((b) => b.type === "text")
+                  .map((b) => b.text.trim())
+                  .join(" ")
+                  .trim();
+              }
+
               expect(toolCall).toBeTruthy();
+              expect(firstText.length).toBe(0);
               if (!toolCall || toolCall.type !== "toolCall") {
                 throw new Error("expected tool call");
               }
@@ -284,12 +319,7 @@ describeLive("live models (profile keys)", () => {
                 model,
                 {
                   messages: [
-                    {
-                      role: "user",
-                      content:
-                        "Call the tool `noop` with {}. Do not write any other text.",
-                      timestamp: Date.now(),
-                    },
+                    firstUser,
                     first,
                     {
                       role: "toolResult",
@@ -305,14 +335,15 @@ describeLive("live models (profile keys)", () => {
                       timestamp: Date.now(),
                     },
                   ],
-                },
-                {
-                  apiKey,
-                  reasoning: resolveTestReasoning(model),
-                  maxTokens: 64,
-                },
-                perModelTimeoutMs,
-              );
+                  },
+                  {
+                    apiKey,
+                    reasoning: resolveTestReasoning(model),
+                    // Headroom: reasoning summary can consume most of the output budget.
+                    maxTokens: 256,
+                  },
+                  perModelTimeoutMs,
+                );
 
               const secondText = second.content
                 .filter((b) => b.type === "text")