fix(openai): avoid invalid reasoning replay

Peter Steinberger
2026-01-10 00:45:10 +00:00
parent 626b085c85
commit cb10682d3e
8 changed files with 495 additions and 6 deletions

View File

@@ -12,6 +12,53 @@ index 0000000..1111111 100644
+ console.log(`[pi-ai] 429 rate limit - failing fast to rotate account`);
+ throw new Error(`Cloud Code Assist API error (${response.status}): ${errorText}`);
+ }
// Check if retryable
if (attempt < MAX_RETRIES && isRetryableError(response.status, errorText)) {
// Use server-provided delay or exponential backoff
// Check if retryable
if (attempt < MAX_RETRIES && isRetryableError(response.status, errorText)) {
// Use server-provided delay or exponential backoff
diff --git a/dist/providers/openai-responses.js b/dist/providers/openai-responses.js
index 0000000..1111111 100644
--- a/dist/providers/openai-responses.js
+++ b/dist/providers/openai-responses.js
@@ -397,9 +397,17 @@ function convertMessages(model, context) {
}
else if (msg.role === "assistant") {
const output = [];
+ // OpenAI Responses rejects `reasoning` items that are not followed by a `message`.
+ // Tool-call-only turns (thinking + function_call) are valid assistant turns, but
+ // their stored reasoning items must not be replayed as standalone `reasoning` input.
+ const hasTextBlock = msg.content.some((b) => b.type === "text");
for (const block of msg.content) {
// Do not submit thinking blocks if the completion had an error (i.e. abort)
if (block.type === "thinking" && msg.stopReason !== "error") {
if (block.thinkingSignature) {
+ if (!hasTextBlock)
+ continue;
const reasoningItem = JSON.parse(block.thinkingSignature);
output.push(reasoningItem);
}
}
else if (block.type === "text") {
diff --git a/dist/providers/openai-codex-responses.js b/dist/providers/openai-codex-responses.js
index 0000000..1111111 100644
--- a/dist/providers/openai-codex-responses.js
+++ b/dist/providers/openai-codex-responses.js
@@ -434,9 +434,17 @@ function convertMessages(model, context) {
}
else if (msg.role === "assistant") {
const output = [];
+ // OpenAI Responses rejects `reasoning` items that are not followed by a `message`.
+ // Tool-call-only turns (thinking + function_call) are valid assistant turns, but
+ // their stored reasoning items must not be replayed as standalone `reasoning` input.
+ const hasTextBlock = msg.content.some((b) => b.type === "text");
for (const block of msg.content) {
if (block.type === "thinking" && msg.stopReason !== "error") {
if (block.thinkingSignature) {
+ if (!hasTextBlock)
+ continue;
const reasoningItem = JSON.parse(block.thinkingSignature);
output.push(reasoningItem);
}
}
else if (block.type === "text") {
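
The guard added in both provider conversions keeps tool-call-only turns from replaying their stored reasoning item on its own: per the comments above, the Responses API only accepts a replayed `reasoning` item when a `message` item follows it. A rough sketch of the two input shapes involved (ids and field values are placeholders for illustration, not values taken from this commit):

// Illustrative Responses API input shapes only; ids and fields are placeholders.
// Rejected per the comment above: a tool-call-only turn replayed together with its
// standalone reasoning item.
const rejectedInput = [
  { type: "reasoning", id: "rs_placeholder", summary: [] },
  { type: "function_call", call_id: "call_placeholder", name: "noop", arguments: "{}" },
];
// Accepted: the reasoning item is immediately followed by an assistant message item.
const acceptedInput = [
  { type: "reasoning", id: "rs_placeholder", summary: [] },
  { type: "message", role: "assistant", content: [{ type: "output_text", text: "hello" }] },
];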

View File

@@ -0,0 +1,23 @@
#!/usr/bin/env bash
set -euo pipefail
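# Build the clawdbot image and run the live model test suite inside it,
# mounting the local clawdbot config and workspace directories.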
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
IMAGE_NAME="${CLAWDBOT_IMAGE:-clawdbot:local}"
CONFIG_DIR="${CLAWDBOT_CONFIG_DIR:-$HOME/.clawdbot}"
WORKSPACE_DIR="${CLAWDBOT_WORKSPACE_DIR:-$HOME/clawd}"
echo "==> Build image: $IMAGE_NAME"
docker build -t "$IMAGE_NAME" -f "$ROOT_DIR/Dockerfile" "$ROOT_DIR"
echo "==> Run live model tests (profile keys)"
docker run --rm -t \
--entrypoint bash \
-e HOME=/home/node \
-e LIVE=1 \
-e CLAWDBOT_LIVE_ALL_MODELS=1 \
-e CLAWDBOT_LIVE_REQUIRE_PROFILE_KEYS=1 \
-v "$CONFIG_DIR":/home/node/.clawdbot \
-v "$WORKSPACE_DIR":/home/node/clawd \
"$IMAGE_NAME" \
-lc "cd /app && pnpm test:live"

View File

@@ -0,0 +1,196 @@
import { type Api, completeSimple, type Model } from "@mariozechner/pi-ai";
import {
discoverAuthStorage,
discoverModels,
} from "@mariozechner/pi-coding-agent";
import { Type } from "@sinclair/typebox";
import { describe, expect, it } from "vitest";
import { loadConfig } from "../config/config.js";
import { resolveClawdbotAgentDir } from "./agent-paths.js";
import { getApiKeyForModel } from "./model-auth.js";
import { ensureClawdbotModelsJson } from "./models-config.js";
const LIVE = process.env.LIVE === "1" || process.env.CLAWDBOT_LIVE_TEST === "1";
const ALL_MODELS =
process.env.CLAWDBOT_LIVE_ALL_MODELS === "1" ||
process.env.CLAWDBOT_LIVE_MODELS === "all";
const REQUIRE_PROFILE_KEYS =
process.env.CLAWDBOT_LIVE_REQUIRE_PROFILE_KEYS === "1";
const describeLive = LIVE && ALL_MODELS ? describe : describe.skip;
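// CLAWDBOT_LIVE_MODELS may be "all" (or unset) to run every model, or a
// comma-separated list of provider/model ids to restrict the run.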
function parseModelFilter(raw?: string): Set<string> | null {
const trimmed = raw?.trim();
if (!trimmed || trimmed === "all") return null;
const ids = trimmed
.split(",")
.map((s) => s.trim())
.filter(Boolean);
return ids.length ? new Set(ids) : null;
}
describeLive("live models (profile keys)", () => {
it(
"completes across configured models",
async () => {
const cfg = loadConfig();
await ensureClawdbotModelsJson(cfg);
const agentDir = resolveClawdbotAgentDir();
const authStorage = discoverAuthStorage(agentDir);
const modelRegistry = discoverModels(authStorage, agentDir);
const models = modelRegistry.getAll() as Array<Model<Api>>;
const filter = parseModelFilter(process.env.CLAWDBOT_LIVE_MODELS);
const failures: Array<{ model: string; error: string }> = [];
const skipped: Array<{ model: string; reason: string }> = [];
for (const model of models) {
const id = `${model.provider}/${model.id}`;
if (filter && !filter.has(id)) continue;
let apiKeyInfo: Awaited<ReturnType<typeof getApiKeyForModel>>;
try {
apiKeyInfo = await getApiKeyForModel({ model, cfg });
} catch (err) {
skipped.push({ model: id, reason: String(err) });
continue;
}
if (REQUIRE_PROFILE_KEYS && !apiKeyInfo.source.startsWith("profile:")) {
skipped.push({
model: id,
reason: `non-profile credential source: ${apiKeyInfo.source}`,
});
continue;
}
try {
// Special regression: OpenAI rejects replayed `reasoning` items for tool-only turns.
if (
model.provider === "openai" &&
model.api === "openai-responses" &&
model.id === "gpt-5.2"
) {
const noopTool = {
name: "noop",
description: "Return ok.",
parameters: Type.Object({}, { additionalProperties: false }),
};
const first = await completeSimple(
model,
{
messages: [
{
role: "user",
content:
"Call the tool `noop` with {}. Do not write any other text.",
timestamp: Date.now(),
},
],
tools: [noopTool],
},
{
apiKey: apiKeyInfo.apiKey,
reasoning: model.reasoning ? "low" : undefined,
maxTokens: 128,
temperature: 0,
},
);
const toolCall = first.content.find((b) => b.type === "toolCall");
expect(toolCall).toBeTruthy();
if (!toolCall || toolCall.type !== "toolCall") {
throw new Error("expected tool call");
}
const second = await completeSimple(
model,
{
messages: [
{
role: "user",
content:
"Call the tool `noop` with {}. Do not write any other text.",
timestamp: Date.now(),
},
first,
{
role: "toolResult",
toolCallId: toolCall.id,
toolName: "noop",
content: [{ type: "text", text: "ok" }],
isError: false,
timestamp: Date.now(),
},
{
role: "user",
content: "Reply with the word ok.",
timestamp: Date.now(),
},
],
},
{
apiKey: apiKeyInfo.apiKey,
reasoning: model.reasoning ? "low" : undefined,
maxTokens: 64,
temperature: 0,
},
);
const secondText = second.content
.filter((b) => b.type === "text")
.map((b) => b.text.trim())
.join(" ");
expect(secondText.length).toBeGreaterThan(0);
continue;
}
const res = await completeSimple(
model,
{
messages: [
{
role: "user",
content: "Reply with the word ok.",
timestamp: Date.now(),
},
],
},
{
apiKey: apiKeyInfo.apiKey,
reasoning: model.reasoning ? "low" : undefined,
maxTokens: 64,
temperature: 0,
},
);
const text = res.content
.filter((block) => block.type === "text")
.map((block) => block.text.trim())
.join(" ");
expect(text.length).toBeGreaterThan(0);
} catch (err) {
failures.push({ model: id, error: String(err) });
}
}
if (failures.length > 0) {
const preview = failures
.slice(0, 10)
.map((f) => `- ${f.model}: ${f.error}`)
.join("\n");
throw new Error(
`live model failures (${failures.length}):\n${preview}`,
);
}
// Keep one assertion so the test fails loudly if we somehow ran nothing.
expect(models.length).toBeGreaterThan(0);
void skipped;
},
15 * 60 * 1000,
);
});

View File

@@ -0,0 +1,216 @@
import type {
AssistantMessage,
Model,
ToolResultMessage,
} from "@mariozechner/pi-ai";
import { streamOpenAIResponses } from "@mariozechner/pi-ai";
import { Type } from "@sinclair/typebox";
import { describe, expect, it } from "vitest";
function buildModel(): Model<"openai-responses"> {
return {
id: "gpt-5.2",
name: "gpt-5.2",
api: "openai-responses",
provider: "openai",
baseUrl: "https://api.openai.com/v1",
reasoning: true,
input: ["text"],
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
contextWindow: 128_000,
maxTokens: 4096,
};
}
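// Replaces global fetch so the test can capture the serialized request body,
// then throws before any real network I/O; restore() puts the original fetch back.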
function installFailingFetchCapture() {
const originalFetch = globalThis.fetch;
let lastBody: unknown;
const fetchImpl: typeof fetch = async (_input, init) => {
const rawBody = init?.body;
const bodyText = (() => {
if (!rawBody) return "";
if (typeof rawBody === "string") return rawBody;
if (rawBody instanceof Uint8Array)
return Buffer.from(rawBody).toString("utf8");
if (rawBody instanceof ArrayBuffer)
return Buffer.from(new Uint8Array(rawBody)).toString("utf8");
return String(rawBody);
})();
lastBody = bodyText ? (JSON.parse(bodyText) as unknown) : undefined;
throw new Error("intentional fetch abort (test)");
};
globalThis.fetch = fetchImpl;
return {
getLastBody: () => lastBody as Record<string, unknown> | undefined,
restore: () => {
globalThis.fetch = originalFetch;
},
};
}
describe("openai-responses reasoning replay", () => {
it("does not replay standalone reasoning for tool-call-only turns", async () => {
const cap = installFailingFetchCapture();
try {
const model = buildModel();
const assistantToolOnly: AssistantMessage = {
role: "assistant",
api: "openai-responses",
provider: "openai",
model: "gpt-5.2",
usage: {
input: 0,
output: 0,
cacheRead: 0,
cacheWrite: 0,
totalTokens: 0,
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
},
stopReason: "toolUse",
timestamp: Date.now(),
content: [
{
type: "thinking",
thinking: "internal",
thinkingSignature: JSON.stringify({
type: "reasoning",
id: "rs_test",
summary: [],
}),
},
{
type: "toolCall",
id: "call_123|fc_123",
name: "noop",
arguments: {},
},
],
};
const toolResult: ToolResultMessage = {
role: "toolResult",
toolCallId: "call_123|fc_123",
toolName: "noop",
content: [{ type: "text", text: "ok" }],
isError: false,
timestamp: Date.now(),
};
const stream = streamOpenAIResponses(
model,
{
systemPrompt: "system",
messages: [
{
role: "user",
content: "Call noop.",
timestamp: Date.now(),
},
assistantToolOnly,
toolResult,
{
role: "user",
content: "Now reply with ok.",
timestamp: Date.now(),
},
],
tools: [
{
name: "noop",
description: "no-op",
parameters: Type.Object({}, { additionalProperties: false }),
},
],
},
{ apiKey: "test" },
);
await stream.result();
const body = cap.getLastBody();
const input = Array.isArray(body?.input) ? body?.input : [];
const types = input
.map((item) =>
item && typeof item === "object"
? (item as Record<string, unknown>).type
: undefined,
)
.filter((t): t is string => typeof t === "string");
expect(types).toContain("function_call");
expect(types).not.toContain("reasoning");
} finally {
cap.restore();
}
});
it("still replays reasoning when paired with an assistant message", async () => {
const cap = installFailingFetchCapture();
try {
const model = buildModel();
const assistantWithText: AssistantMessage = {
role: "assistant",
api: "openai-responses",
provider: "openai",
model: "gpt-5.2",
usage: {
input: 0,
output: 0,
cacheRead: 0,
cacheWrite: 0,
totalTokens: 0,
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
},
stopReason: "stop",
timestamp: Date.now(),
content: [
{
type: "thinking",
thinking: "internal",
thinkingSignature: JSON.stringify({
type: "reasoning",
id: "rs_test",
summary: [],
}),
},
{ type: "text", text: "hello", textSignature: "msg_test" },
],
};
const stream = streamOpenAIResponses(
model,
{
systemPrompt: "system",
messages: [
{ role: "user", content: "Hi", timestamp: Date.now() },
assistantWithText,
{ role: "user", content: "Ok", timestamp: Date.now() },
],
},
{ apiKey: "test" },
);
await stream.result();
const body = cap.getLastBody();
const input = Array.isArray(body?.input) ? body?.input : [];
const types = input
.map((item) =>
item && typeof item === "object"
? (item as Record<string, unknown>).type
: undefined,
)
.filter((t): t is string => typeof t === "string");
expect(types).toContain("reasoning");
expect(types).toContain("message");
} finally {
cap.restore();
}
});
});

View File

@@ -860,6 +860,7 @@ export async function compactEmbeddedPiSession(params: {
params.sessionKey ?? params.sessionId,
);
const contextFiles = buildBootstrapContextFiles(bootstrapFiles);
const runAbortController = new AbortController();
const tools = createClawdbotCodingTools({
bash: {
...params.config?.tools?.bash,

View File

@@ -651,7 +651,9 @@ export function createClawdbotCodingTools(options?: {
// Without this, some providers (notably OpenAI) will reject root-level union schemas.
const normalized = subagentFiltered.map(normalizeToolParameters);
const withAbort = options?.abortSignal
? normalized.map((tool) => wrapToolWithAbortSignal(tool, options.abortSignal))
? normalized.map((tool) =>
wrapToolWithAbortSignal(tool, options.abortSignal),
)
: normalized;
// Anthropic blocks specific lowercase tool names (bash, read, write, edit) with OAuth tokens.
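
A note on the normalization comment above: OpenAI function tools expect the parameters root to be an object schema, so a root-level union is the shape that gets rejected. The sketch below only illustrates that shape and one possible way to nest it; it is not the repo's normalizeToolParameters implementation, and the names are hypothetical.

import { Type } from "@sinclair/typebox";

// A root-level union: the kind of tool parameter schema the comment above says
// providers such as OpenAI reject without normalization.
const rootUnion = Type.Union([
  Type.Object({ action: Type.Literal("read"), path: Type.String() }),
  Type.Object({ action: Type.Literal("write"), path: Type.String(), content: Type.String() }),
]);

// One possible (hypothetical) normalization: keep an object root and move the union
// under a property; the actual normalizeToolParameters may do something different.
const normalizedExample = Type.Object({ request: rootUnion });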

View File

@@ -4,6 +4,7 @@ import {
loadSessionStore,
resolveStorePath,
saveSessionStore,
type SessionEntry,
} from "../../config/sessions.js";
import {
parseAgentSessionKey,
@@ -35,7 +36,7 @@ export function setAbortMemory(key: string, value: boolean): void {
}
function resolveSessionEntryForKey(
store: Record<string, { sessionId: string; updatedAt: number }> | undefined,
store: Record<string, SessionEntry> | undefined,
sessionKey: string | undefined,
) {
if (!store || !sessionKey) return {};

View File

@@ -7,7 +7,10 @@ import type { ReplyDispatcher } from "./reply-dispatcher.js";
const mocks = vi.hoisted(() => ({
routeReply: vi.fn(async () => ({ ok: true, messageId: "mock" })),
tryFastAbortFromMessage: vi.fn(async () => ({ handled: false, aborted: false })),
tryFastAbortFromMessage: vi.fn(async () => ({
handled: false,
aborted: false,
})),
}));
vi.mock("./route-reply.js", () => ({