From cb10682d3ee87a704c2ad4be516b5b491392d13e Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 10 Jan 2026 00:45:10 +0000 Subject: [PATCH] fix(openai): avoid invalid reasoning replay --- patches/@mariozechner__pi-ai.patch | 53 ++++- scripts/test-live-models-docker.sh | 23 ++ src/agents/models.profiles.live.test.ts | 196 ++++++++++++++++ .../openai-responses.reasoning-replay.test.ts | 216 ++++++++++++++++++ src/agents/pi-embedded-runner.ts | 1 + src/agents/pi-tools.ts | 4 +- src/auto-reply/reply/abort.ts | 3 +- .../reply/dispatch-from-config.test.ts | 5 +- 8 files changed, 495 insertions(+), 6 deletions(-) create mode 100755 scripts/test-live-models-docker.sh create mode 100644 src/agents/models.profiles.live.test.ts create mode 100644 src/agents/openai-responses.reasoning-replay.test.ts diff --git a/patches/@mariozechner__pi-ai.patch b/patches/@mariozechner__pi-ai.patch index 65b206c74..4ca0a2e45 100644 --- a/patches/@mariozechner__pi-ai.patch +++ b/patches/@mariozechner__pi-ai.patch @@ -12,6 +12,53 @@ index 0000000..1111111 100644 + console.log(`[pi-ai] 429 rate limit - failing fast to rotate account`); + throw new Error(`Cloud Code Assist API error (${response.status}): ${errorText}`); + } - // Check if retryable - if (attempt < MAX_RETRIES && isRetryableError(response.status, errorText)) { - // Use server-provided delay or exponential backoff + // Check if retryable + if (attempt < MAX_RETRIES && isRetryableError(response.status, errorText)) { + // Use server-provided delay or exponential backoff + +diff --git a/dist/providers/openai-responses.js b/dist/providers/openai-responses.js +index 0000000..1111111 100644 +--- a/dist/providers/openai-responses.js ++++ b/dist/providers/openai-responses.js +@@ -397,9 +397,17 @@ function convertMessages(model, context) { + } + else if (msg.role === "assistant") { + const output = []; ++ // OpenAI Responses rejects `reasoning` items that are not followed by a `message`. 
++ // Tool-call-only turns (thinking + function_call) are valid assistant turns, but ++ // their stored reasoning items must not be replayed as standalone `reasoning` input. ++ const hasTextBlock = msg.content.some((b) => b.type === "text"); + for (const block of msg.content) { + // Do not submit thinking blocks if the completion had an error (i.e. abort) + if (block.type === "thinking" && msg.stopReason !== "error") { + if (block.thinkingSignature) { ++ if (!hasTextBlock) ++ continue; + const reasoningItem = JSON.parse(block.thinkingSignature); + output.push(reasoningItem); + } + } + else if (block.type === "text") { + +diff --git a/dist/providers/openai-codex-responses.js b/dist/providers/openai-codex-responses.js +index 0000000..1111111 100644 +--- a/dist/providers/openai-codex-responses.js ++++ b/dist/providers/openai-codex-responses.js +@@ -434,9 +434,17 @@ function convertMessages(model, context) { + } + else if (msg.role === "assistant") { + const output = []; ++ // OpenAI Responses rejects `reasoning` items that are not followed by a `message`. ++ // Tool-call-only turns (thinking + function_call) are valid assistant turns, but ++ // their stored reasoning items must not be replayed as standalone `reasoning` input. ++ const hasTextBlock = msg.content.some((b) => b.type === "text"); + for (const block of msg.content) { + if (block.type === "thinking" && msg.stopReason !== "error") { + if (block.thinkingSignature) { ++ if (!hasTextBlock) ++ continue; + const reasoningItem = JSON.parse(block.thinkingSignature); + output.push(reasoningItem); + } + } + else if (block.type === "text") { diff --git a/scripts/test-live-models-docker.sh b/scripts/test-live-models-docker.sh new file mode 100755 index 000000000..4f03b5ab9 --- /dev/null +++ b/scripts/test-live-models-docker.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)"
+IMAGE_NAME="${CLAWDBOT_IMAGE:-clawdbot:local}"
+CONFIG_DIR="${CLAWDBOT_CONFIG_DIR:-$HOME/.clawdbot}"
+WORKSPACE_DIR="${CLAWDBOT_WORKSPACE_DIR:-$HOME/clawd}"
+
+echo "==> Build image: $IMAGE_NAME"
+docker build -t "$IMAGE_NAME" -f "$ROOT_DIR/Dockerfile" "$ROOT_DIR"
+
+echo "==> Run live model tests (profile keys)"
+docker run --rm -t \
+  --entrypoint bash \
+  -e HOME=/home/node \
+  -e LIVE=1 \
+  -e CLAWDBOT_LIVE_ALL_MODELS=1 \
+  -e CLAWDBOT_LIVE_REQUIRE_PROFILE_KEYS=1 \
+  -v "$CONFIG_DIR":/home/node/.clawdbot \
+  -v "$WORKSPACE_DIR":/home/node/clawd \
+  "$IMAGE_NAME" \
+  -lc "cd /app && pnpm test:live"
+
diff --git a/src/agents/models.profiles.live.test.ts b/src/agents/models.profiles.live.test.ts
new file mode 100644
index 000000000..d75c34bc9
--- /dev/null
+++ b/src/agents/models.profiles.live.test.ts
@@ -0,0 +1,196 @@
+import { type Api, completeSimple, type Model } from "@mariozechner/pi-ai";
+import {
+  discoverAuthStorage,
+  discoverModels,
+} from "@mariozechner/pi-coding-agent";
+import { Type } from "@sinclair/typebox";
+import { describe, expect, it } from "vitest";
+import { loadConfig } from "../config/config.js";
+import { resolveClawdbotAgentDir } from "./agent-paths.js";
+import { getApiKeyForModel } from "./model-auth.js";
+import { ensureClawdbotModelsJson } from "./models-config.js";
+
+const LIVE = process.env.LIVE === "1" || process.env.CLAWDBOT_LIVE_TEST === "1";
+const ALL_MODELS =
+  process.env.CLAWDBOT_LIVE_ALL_MODELS === "1" ||
+  process.env.CLAWDBOT_LIVE_MODELS === "all";
+const REQUIRE_PROFILE_KEYS =
+  process.env.CLAWDBOT_LIVE_REQUIRE_PROFILE_KEYS === "1";
+
+const describeLive = LIVE && ALL_MODELS ? describe : describe.skip;
+
+function parseModelFilter(raw?: string): Set<string> | null {
+  const trimmed = raw?.trim();
+  if (!trimmed || trimmed === "all") return null;
+  const ids = trimmed
+    .split(",")
+    .map((s) => s.trim())
+    .filter(Boolean);
+  return ids.length ?
new Set(ids) : null;
+}
+
+describeLive("live models (profile keys)", () => {
+  it(
+    "completes across configured models",
+    async () => {
+      const cfg = loadConfig();
+      await ensureClawdbotModelsJson(cfg);
+
+      const agentDir = resolveClawdbotAgentDir();
+      const authStorage = discoverAuthStorage(agentDir);
+      const modelRegistry = discoverModels(authStorage, agentDir);
+      const models = modelRegistry.getAll() as Array<Model<Api>>;
+
+      const filter = parseModelFilter(process.env.CLAWDBOT_LIVE_MODELS);
+
+      const failures: Array<{ model: string; error: string }> = [];
+      const skipped: Array<{ model: string; reason: string }> = [];
+
+      for (const model of models) {
+        const id = `${model.provider}/${model.id}`;
+        if (filter && !filter.has(id)) continue;
+
+        let apiKeyInfo: Awaited<ReturnType<typeof getApiKeyForModel>>;
+        try {
+          apiKeyInfo = await getApiKeyForModel({ model, cfg });
+        } catch (err) {
+          skipped.push({ model: id, reason: String(err) });
+          continue;
+        }
+
+        if (REQUIRE_PROFILE_KEYS && !apiKeyInfo.source.startsWith("profile:")) {
+          skipped.push({
+            model: id,
+            reason: `non-profile credential source: ${apiKeyInfo.source}`,
+          });
+          continue;
+        }
+
+        try {
+          // Special regression: OpenAI rejects replayed `reasoning` items for tool-only turns.
+          if (
+            model.provider === "openai" &&
+            model.api === "openai-responses" &&
+            model.id === "gpt-5.2"
+          ) {
+            const noopTool = {
+              name: "noop",
+              description: "Return ok.",
+              parameters: Type.Object({}, { additionalProperties: false }),
+            };
+
+            const first = await completeSimple(
+              model,
+              {
+                messages: [
+                  {
+                    role: "user",
+                    content:
+                      "Call the tool `noop` with {}. Do not write any other text.",
+                    timestamp: Date.now(),
+                  },
+                ],
+                tools: [noopTool],
+              },
+              {
+                apiKey: apiKeyInfo.apiKey,
+                reasoning: model.reasoning ?
"low" : undefined, + maxTokens: 128, + temperature: 0, + }, + ); + + const toolCall = first.content.find((b) => b.type === "toolCall"); + expect(toolCall).toBeTruthy(); + if (!toolCall || toolCall.type !== "toolCall") { + throw new Error("expected tool call"); + } + + const second = await completeSimple( + model, + { + messages: [ + { + role: "user", + content: + "Call the tool `noop` with {}. Do not write any other text.", + timestamp: Date.now(), + }, + first, + { + role: "toolResult", + toolCallId: toolCall.id, + toolName: "noop", + content: [{ type: "text", text: "ok" }], + isError: false, + timestamp: Date.now(), + }, + { + role: "user", + content: "Reply with the word ok.", + timestamp: Date.now(), + }, + ], + }, + { + apiKey: apiKeyInfo.apiKey, + reasoning: model.reasoning ? "low" : undefined, + maxTokens: 64, + temperature: 0, + }, + ); + + const secondText = second.content + .filter((b) => b.type === "text") + .map((b) => b.text.trim()) + .join(" "); + expect(secondText.length).toBeGreaterThan(0); + continue; + } + + const res = await completeSimple( + model, + { + messages: [ + { + role: "user", + content: "Reply with the word ok.", + timestamp: Date.now(), + }, + ], + }, + { + apiKey: apiKeyInfo.apiKey, + reasoning: model.reasoning ? "low" : undefined, + maxTokens: 64, + temperature: 0, + }, + ); + + const text = res.content + .filter((block) => block.type === "text") + .map((block) => block.text.trim()) + .join(" "); + expect(text.length).toBeGreaterThan(0); + } catch (err) { + failures.push({ model: id, error: String(err) }); + } + } + + if (failures.length > 0) { + const preview = failures + .slice(0, 10) + .map((f) => `- ${f.model}: ${f.error}`) + .join("\n"); + throw new Error( + `live model failures (${failures.length}):\n${preview}`, + ); + } + + // Keep one assertion so the test fails loudly if we somehow ran nothing. 
+ expect(models.length).toBeGreaterThan(0); + void skipped; + }, + 15 * 60 * 1000, + ); +}); diff --git a/src/agents/openai-responses.reasoning-replay.test.ts b/src/agents/openai-responses.reasoning-replay.test.ts new file mode 100644 index 000000000..544079330 --- /dev/null +++ b/src/agents/openai-responses.reasoning-replay.test.ts @@ -0,0 +1,216 @@ +import type { + AssistantMessage, + Model, + ToolResultMessage, +} from "@mariozechner/pi-ai"; +import { streamOpenAIResponses } from "@mariozechner/pi-ai"; +import { Type } from "@sinclair/typebox"; +import { describe, expect, it } from "vitest"; + +function buildModel(): Model<"openai-responses"> { + return { + id: "gpt-5.2", + name: "gpt-5.2", + api: "openai-responses", + provider: "openai", + baseUrl: "https://api.openai.com/v1", + reasoning: true, + input: ["text"], + cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, + contextWindow: 128_000, + maxTokens: 4096, + }; +} + +function installFailingFetchCapture() { + const originalFetch = globalThis.fetch; + let lastBody: unknown; + + const fetchImpl: typeof fetch = async (_input, init) => { + const rawBody = init?.body; + const bodyText = (() => { + if (!rawBody) return ""; + if (typeof rawBody === "string") return rawBody; + if (rawBody instanceof Uint8Array) + return Buffer.from(rawBody).toString("utf8"); + if (rawBody instanceof ArrayBuffer) + return Buffer.from(new Uint8Array(rawBody)).toString("utf8"); + return String(rawBody); + })(); + lastBody = bodyText ? 
(JSON.parse(bodyText) as unknown) : undefined;
+    throw new Error("intentional fetch abort (test)");
+  };
+
+  globalThis.fetch = fetchImpl;
+
+  return {
+    getLastBody: () => lastBody as Record<string, unknown> | undefined,
+    restore: () => {
+      globalThis.fetch = originalFetch;
+    },
+  };
+}
+
+describe("openai-responses reasoning replay", () => {
+  it("does not replay standalone reasoning for tool-call-only turns", async () => {
+    const cap = installFailingFetchCapture();
+    try {
+      const model = buildModel();
+
+      const assistantToolOnly: AssistantMessage = {
+        role: "assistant",
+        api: "openai-responses",
+        provider: "openai",
+        model: "gpt-5.2",
+        usage: {
+          input: 0,
+          output: 0,
+          cacheRead: 0,
+          cacheWrite: 0,
+          totalTokens: 0,
+          cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
+        },
+        stopReason: "toolUse",
+        timestamp: Date.now(),
+        content: [
+          {
+            type: "thinking",
+            thinking: "internal",
+            thinkingSignature: JSON.stringify({
+              type: "reasoning",
+              id: "rs_test",
+              summary: [],
+            }),
+          },
+          {
+            type: "toolCall",
+            id: "call_123|fc_123",
+            name: "noop",
+            arguments: {},
+          },
+        ],
+      };
+
+      const toolResult: ToolResultMessage = {
+        role: "toolResult",
+        toolCallId: "call_123|fc_123",
+        toolName: "noop",
+        content: [{ type: "text", text: "ok" }],
+        isError: false,
+        timestamp: Date.now(),
+      };
+
+      const stream = streamOpenAIResponses(
+        model,
+        {
+          systemPrompt: "system",
+          messages: [
+            {
+              role: "user",
+              content: "Call noop.",
+              timestamp: Date.now(),
+            },
+            assistantToolOnly,
+            toolResult,
+            {
+              role: "user",
+              content: "Now reply with ok.",
+              timestamp: Date.now(),
+            },
+          ],
+          tools: [
+            {
+              name: "noop",
+              description: "no-op",
+              parameters: Type.Object({}, { additionalProperties: false }),
+            },
+          ],
+        },
+        { apiKey: "test" },
+      );
+
+      await stream.result();
+
+      const body = cap.getLastBody();
+      const input = Array.isArray(body?.input) ? body?.input : [];
+      const types = input
+        .map((item) =>
+          item && typeof item === "object"
+            ?
(item as Record<string, unknown>).type
+            : undefined,
+        )
+        .filter((t): t is string => typeof t === "string");
+
+      expect(types).toContain("function_call");
+      expect(types).not.toContain("reasoning");
+    } finally {
+      cap.restore();
+    }
+  });
+
+  it("still replays reasoning when paired with an assistant message", async () => {
+    const cap = installFailingFetchCapture();
+    try {
+      const model = buildModel();
+
+      const assistantWithText: AssistantMessage = {
+        role: "assistant",
+        api: "openai-responses",
+        provider: "openai",
+        model: "gpt-5.2",
+        usage: {
+          input: 0,
+          output: 0,
+          cacheRead: 0,
+          cacheWrite: 0,
+          totalTokens: 0,
+          cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
+        },
+        stopReason: "stop",
+        timestamp: Date.now(),
+        content: [
+          {
+            type: "thinking",
+            thinking: "internal",
+            thinkingSignature: JSON.stringify({
+              type: "reasoning",
+              id: "rs_test",
+              summary: [],
+            }),
+          },
+          { type: "text", text: "hello", textSignature: "msg_test" },
+        ],
+      };
+
+      const stream = streamOpenAIResponses(
+        model,
+        {
+          systemPrompt: "system",
+          messages: [
+            { role: "user", content: "Hi", timestamp: Date.now() },
+            assistantWithText,
+            { role: "user", content: "Ok", timestamp: Date.now() },
+          ],
+        },
+        { apiKey: "test" },
+      );
+
+      await stream.result();
+
+      const body = cap.getLastBody();
+      const input = Array.isArray(body?.input) ? body?.input : [];
+      const types = input
+        .map((item) =>
+          item && typeof item === "object"
+            ? (item as Record<string, unknown>).type
+            : undefined,
+        )
+        .filter((t): t is string => typeof t === "string");
+
+      expect(types).toContain("reasoning");
+      expect(types).toContain("message");
+    } finally {
+      cap.restore();
+    }
+  });
+});
diff --git a/src/agents/pi-embedded-runner.ts b/src/agents/pi-embedded-runner.ts
index 7a416c60c..4812444fc 100644
--- a/src/agents/pi-embedded-runner.ts
+++ b/src/agents/pi-embedded-runner.ts
@@ -860,6 +860,7 @@ export async function compactEmbeddedPiSession(params: {
     params.sessionKey ??
params.sessionId,
   );
   const contextFiles = buildBootstrapContextFiles(bootstrapFiles);
+  const runAbortController = new AbortController();
   const tools = createClawdbotCodingTools({
     bash: {
       ...params.config?.tools?.bash,
diff --git a/src/agents/pi-tools.ts b/src/agents/pi-tools.ts
index b330506dc..e68b530d7 100644
--- a/src/agents/pi-tools.ts
+++ b/src/agents/pi-tools.ts
@@ -651,7 +651,9 @@ export function createClawdbotCodingTools(options?: {
   // Without this, some providers (notably OpenAI) will reject root-level union schemas.
   const normalized = subagentFiltered.map(normalizeToolParameters);
   const withAbort = options?.abortSignal
-    ? normalized.map((tool) => wrapToolWithAbortSignal(tool, options.abortSignal))
+    ? normalized.map((tool) =>
+        wrapToolWithAbortSignal(tool, options.abortSignal),
+      )
     : normalized;
   // Anthropic blocks specific lowercase tool names (bash, read, write, edit) with OAuth tokens.
diff --git a/src/auto-reply/reply/abort.ts b/src/auto-reply/reply/abort.ts
index 7f9fefa05..07c00babb 100644
--- a/src/auto-reply/reply/abort.ts
+++ b/src/auto-reply/reply/abort.ts
@@ -4,6 +4,7 @@ import {
   loadSessionStore,
   resolveStorePath,
   saveSessionStore,
+  type SessionEntry,
 } from "../../config/sessions.js";
 import {
   parseAgentSessionKey,
@@ -35,7 +36,7 @@ export function setAbortMemory(key: string, value: boolean): void {
 }
 
 function resolveSessionEntryForKey(
-  store: Record<string, unknown> | undefined,
+  store: Record<string, SessionEntry> | undefined,
   sessionKey: string | undefined,
 ) {
   if (!store || !sessionKey) return {};
diff --git a/src/auto-reply/reply/dispatch-from-config.test.ts b/src/auto-reply/reply/dispatch-from-config.test.ts
index 557e804d1..26bb863cb 100644
--- a/src/auto-reply/reply/dispatch-from-config.test.ts
+++ b/src/auto-reply/reply/dispatch-from-config.test.ts
@@ -7,7 +7,10 @@ import type { ReplyDispatcher } from "./reply-dispatcher.js";
 
 const mocks = vi.hoisted(() => ({
   routeReply: vi.fn(async () => ({ ok: true, messageId: "mock" })),
-  tryFastAbortFromMessage: vi.fn(async ()
=> ({ handled: false, aborted: false })), + tryFastAbortFromMessage: vi.fn(async () => ({ + handled: false, + aborted: false, + })), })); vi.mock("./route-reply.js", () => ({