fix(openai): avoid invalid reasoning replay

Peter Steinberger
2026-01-10 00:45:10 +00:00
parent 626b085c85
commit cb10682d3e
8 changed files with 495 additions and 6 deletions

View File

@@ -12,6 +12,53 @@ index 0000000..1111111 100644
+ console.log(`[pi-ai] 429 rate limit - failing fast to rotate account`);
+ throw new Error(`Cloud Code Assist API error (${response.status}): ${errorText}`);
+ }
// Check if retryable
if (attempt < MAX_RETRIES && isRetryableError(response.status, errorText)) {
// Use server-provided delay or exponential backoff
// Check if retryable
if (attempt < MAX_RETRIES && isRetryableError(response.status, errorText)) {
// Use server-provided delay or exponential backoff
diff --git a/dist/providers/openai-responses.js b/dist/providers/openai-responses.js
index 0000000..1111111 100644
--- a/dist/providers/openai-responses.js
+++ b/dist/providers/openai-responses.js
@@ -397,9 +397,17 @@ function convertMessages(model, context) {
}
else if (msg.role === "assistant") {
const output = [];
+ // OpenAI Responses rejects `reasoning` items that are not followed by a `message`.
+ // Tool-call-only turns (thinking + function_call) are valid assistant turns, but
+ // their stored reasoning items must not be replayed as standalone `reasoning` input.
+ const hasTextBlock = msg.content.some((b) => b.type === "text");
for (const block of msg.content) {
// Do not submit thinking blocks if the completion had an error (i.e. abort)
if (block.type === "thinking" && msg.stopReason !== "error") {
if (block.thinkingSignature) {
+ if (!hasTextBlock)
+ continue;
const reasoningItem = JSON.parse(block.thinkingSignature);
output.push(reasoningItem);
}
}
else if (block.type === "text") {
diff --git a/dist/providers/openai-codex-responses.js b/dist/providers/openai-codex-responses.js
index 0000000..1111111 100644
--- a/dist/providers/openai-codex-responses.js
+++ b/dist/providers/openai-codex-responses.js
@@ -434,9 +434,17 @@ function convertMessages(model, context) {
}
else if (msg.role === "assistant") {
const output = [];
+ // OpenAI Responses rejects `reasoning` items that are not followed by a `message`.
+ // Tool-call-only turns (thinking + function_call) are valid assistant turns, but
+ // their stored reasoning items must not be replayed as standalone `reasoning` input.
+ const hasTextBlock = msg.content.some((b) => b.type === "text");
for (const block of msg.content) {
if (block.type === "thinking" && msg.stopReason !== "error") {
if (block.thinkingSignature) {
+ if (!hasTextBlock)
+ continue;
const reasoningItem = JSON.parse(block.thinkingSignature);
output.push(reasoningItem);
}
}
else if (block.type === "text") {
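
The guard added in both provider conversions keeps tool-call-only turns from replaying their stored reasoning item on its own: per the comments above, the Responses API only accepts a replayed `reasoning` item when a `message` item follows it. A rough sketch of the two input shapes involved (ids and field values are placeholders for illustration, not values taken from this commit):

// Illustrative Responses API input shapes only; ids and fields are placeholders.
// Rejected per the comment above: a tool-call-only turn replayed together with its
// standalone reasoning item.
const rejectedInput = [
  { type: "reasoning", id: "rs_placeholder", summary: [] },
  { type: "function_call", call_id: "call_placeholder", name: "noop", arguments: "{}" },
];
// Accepted: the reasoning item is immediately followed by an assistant message item.
const acceptedInput = [
  { type: "reasoning", id: "rs_placeholder", summary: [] },
  { type: "message", role: "assistant", content: [{ type: "output_text", text: "hello" }] },
];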

View File

@@ -0,0 +1,23 @@
#!/usr/bin/env bash
set -euo pipefail
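# Build the clawdbot image and run the live model test suite inside it,
# mounting the local clawdbot config and workspace directories.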
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
IMAGE_NAME="${CLAWDBOT_IMAGE:-clawdbot:local}"
CONFIG_DIR="${CLAWDBOT_CONFIG_DIR:-$HOME/.clawdbot}"
WORKSPACE_DIR="${CLAWDBOT_WORKSPACE_DIR:-$HOME/clawd}"
echo "==> Build image: $IMAGE_NAME"
docker build -t "$IMAGE_NAME" -f "$ROOT_DIR/Dockerfile" "$ROOT_DIR"
echo "==> Run live model tests (profile keys)"
docker run --rm -t \
--entrypoint bash \
-e HOME=/home/node \
-e LIVE=1 \
-e CLAWDBOT_LIVE_ALL_MODELS=1 \
-e CLAWDBOT_LIVE_REQUIRE_PROFILE_KEYS=1 \
-v "$CONFIG_DIR":/home/node/.clawdbot \
-v "$WORKSPACE_DIR":/home/node/clawd \
"$IMAGE_NAME" \
-lc "cd /app && pnpm test:live"

View File

@@ -0,0 +1,196 @@
import { type Api, completeSimple, type Model } from "@mariozechner/pi-ai";
import {
discoverAuthStorage,
discoverModels,
} from "@mariozechner/pi-coding-agent";
import { Type } from "@sinclair/typebox";
import { describe, expect, it } from "vitest";
import { loadConfig } from "../config/config.js";
import { resolveClawdbotAgentDir } from "./agent-paths.js";
import { getApiKeyForModel } from "./model-auth.js";
import { ensureClawdbotModelsJson } from "./models-config.js";
const LIVE = process.env.LIVE === "1" || process.env.CLAWDBOT_LIVE_TEST === "1";
const ALL_MODELS =
process.env.CLAWDBOT_LIVE_ALL_MODELS === "1" ||
process.env.CLAWDBOT_LIVE_MODELS === "all";
const REQUIRE_PROFILE_KEYS =
process.env.CLAWDBOT_LIVE_REQUIRE_PROFILE_KEYS === "1";
const describeLive = LIVE && ALL_MODELS ? describe : describe.skip;
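// CLAWDBOT_LIVE_MODELS may be "all" (or unset) to run every model, or a
// comma-separated list of provider/model ids to restrict the run.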
function parseModelFilter(raw?: string): Set<string> | null {
const trimmed = raw?.trim();
if (!trimmed || trimmed === "all") return null;
const ids = trimmed
.split(",")
.map((s) => s.trim())
.filter(Boolean);
return ids.length ? new Set(ids) : null;
}
describeLive("live models (profile keys)", () => {
it(
"completes across configured models",
async () => {
const cfg = loadConfig();
await ensureClawdbotModelsJson(cfg);
const agentDir = resolveClawdbotAgentDir();
const authStorage = discoverAuthStorage(agentDir);
const modelRegistry = discoverModels(authStorage, agentDir);
const models = modelRegistry.getAll() as Array<Model<Api>>;
const filter = parseModelFilter(process.env.CLAWDBOT_LIVE_MODELS);
const failures: Array<{ model: string; error: string }> = [];
const skipped: Array<{ model: string; reason: string }> = [];
for (const model of models) {
const id = `${model.provider}/${model.id}`;
if (filter && !filter.has(id)) continue;
let apiKeyInfo: Awaited<ReturnType<typeof getApiKeyForModel>>;
try {
apiKeyInfo = await getApiKeyForModel({ model, cfg });
} catch (err) {
skipped.push({ model: id, reason: String(err) });
continue;
}
if (REQUIRE_PROFILE_KEYS && !apiKeyInfo.source.startsWith("profile:")) {
skipped.push({
model: id,
reason: `non-profile credential source: ${apiKeyInfo.source}`,
});
continue;
}
try {
// Special regression: OpenAI rejects replayed `reasoning` items for tool-only turns.
if (
model.provider === "openai" &&
model.api === "openai-responses" &&
model.id === "gpt-5.2"
) {
const noopTool = {
name: "noop",
description: "Return ok.",
parameters: Type.Object({}, { additionalProperties: false }),
};
const first = await completeSimple(
model,
{
messages: [
{
role: "user",
content:
"Call the tool `noop` with {}. Do not write any other text.",
timestamp: Date.now(),
},
],
tools: [noopTool],
},
{
apiKey: apiKeyInfo.apiKey,
reasoning: model.reasoning ? "low" : undefined,
maxTokens: 128,
temperature: 0,
},
);
const toolCall = first.content.find((b) => b.type === "toolCall");
expect(toolCall).toBeTruthy();
if (!toolCall || toolCall.type !== "toolCall") {
throw new Error("expected tool call");
}
const second = await completeSimple(
model,
{
messages: [
{
role: "user",
content:
"Call the tool `noop` with {}. Do not write any other text.",
timestamp: Date.now(),
},
first,
{
role: "toolResult",
toolCallId: toolCall.id,
toolName: "noop",
content: [{ type: "text", text: "ok" }],
isError: false,
timestamp: Date.now(),
},
{
role: "user",
content: "Reply with the word ok.",
timestamp: Date.now(),
},
],
},
{
apiKey: apiKeyInfo.apiKey,
reasoning: model.reasoning ? "low" : undefined,
maxTokens: 64,
temperature: 0,
},
);
const secondText = second.content
.filter((b) => b.type === "text")
.map((b) => b.text.trim())
.join(" ");
expect(secondText.length).toBeGreaterThan(0);
continue;
}
const res = await completeSimple(
model,
{
messages: [
{
role: "user",
content: "Reply with the word ok.",
timestamp: Date.now(),
},
],
},
{
apiKey: apiKeyInfo.apiKey,
reasoning: model.reasoning ? "low" : undefined,
maxTokens: 64,
temperature: 0,
},
);
const text = res.content
.filter((block) => block.type === "text")
.map((block) => block.text.trim())
.join(" ");
expect(text.length).toBeGreaterThan(0);
} catch (err) {
failures.push({ model: id, error: String(err) });
}
}
if (failures.length > 0) {
const preview = failures
.slice(0, 10)
.map((f) => `- ${f.model}: ${f.error}`)
.join("\n");
throw new Error(
`live model failures (${failures.length}):\n${preview}`,
);
}
// Keep one assertion so the test fails loudly if we somehow ran nothing.
expect(models.length).toBeGreaterThan(0);
void skipped;
},
15 * 60 * 1000,
);
});

View File

@@ -0,0 +1,216 @@
import type {
AssistantMessage,
Model,
ToolResultMessage,
} from "@mariozechner/pi-ai";
import { streamOpenAIResponses } from "@mariozechner/pi-ai";
import { Type } from "@sinclair/typebox";
import { describe, expect, it } from "vitest";
function buildModel(): Model<"openai-responses"> {
return {
id: "gpt-5.2",
name: "gpt-5.2",
api: "openai-responses",
provider: "openai",
baseUrl: "https://api.openai.com/v1",
reasoning: true,
input: ["text"],
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
contextWindow: 128_000,
maxTokens: 4096,
};
}
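// Replaces global fetch so the test can capture the serialized request body,
// then throws before any real network I/O; restore() puts the original fetch back.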
function installFailingFetchCapture() {
const originalFetch = globalThis.fetch;
let lastBody: unknown;
const fetchImpl: typeof fetch = async (_input, init) => {
const rawBody = init?.body;
const bodyText = (() => {
if (!rawBody) return "";
if (typeof rawBody === "string") return rawBody;
if (rawBody instanceof Uint8Array)
return Buffer.from(rawBody).toString("utf8");
if (rawBody instanceof ArrayBuffer)
return Buffer.from(new Uint8Array(rawBody)).toString("utf8");
return String(rawBody);
})();
lastBody = bodyText ? (JSON.parse(bodyText) as unknown) : undefined;
throw new Error("intentional fetch abort (test)");
};
globalThis.fetch = fetchImpl;
return {
getLastBody: () => lastBody as Record<string, unknown> | undefined,
restore: () => {
globalThis.fetch = originalFetch;
},
};
}
describe("openai-responses reasoning replay", () => {
it("does not replay standalone reasoning for tool-call-only turns", async () => {
const cap = installFailingFetchCapture();
try {
const model = buildModel();
const assistantToolOnly: AssistantMessage = {
role: "assistant",
api: "openai-responses",
provider: "openai",
model: "gpt-5.2",
usage: {
input: 0,
output: 0,
cacheRead: 0,
cacheWrite: 0,
totalTokens: 0,
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
},
stopReason: "toolUse",
timestamp: Date.now(),
content: [
{
type: "thinking",
thinking: "internal",
thinkingSignature: JSON.stringify({
type: "reasoning",
id: "rs_test",
summary: [],
}),
},
{
type: "toolCall",
id: "call_123|fc_123",
name: "noop",
arguments: {},
},
],
};
const toolResult: ToolResultMessage = {
role: "toolResult",
toolCallId: "call_123|fc_123",
toolName: "noop",
content: [{ type: "text", text: "ok" }],
isError: false,
timestamp: Date.now(),
};
const stream = streamOpenAIResponses(
model,
{
systemPrompt: "system",
messages: [
{
role: "user",
content: "Call noop.",
timestamp: Date.now(),
},
assistantToolOnly,
toolResult,
{
role: "user",
content: "Now reply with ok.",
timestamp: Date.now(),
},
],
tools: [
{
name: "noop",
description: "no-op",
parameters: Type.Object({}, { additionalProperties: false }),
},
],
},
{ apiKey: "test" },
);
await stream.result();
const body = cap.getLastBody();
const input = Array.isArray(body?.input) ? body?.input : [];
const types = input
.map((item) =>
item && typeof item === "object"
? (item as Record<string, unknown>).type
: undefined,
)
.filter((t): t is string => typeof t === "string");
expect(types).toContain("function_call");
expect(types).not.toContain("reasoning");
} finally {
cap.restore();
}
});
it("still replays reasoning when paired with an assistant message", async () => {
const cap = installFailingFetchCapture();
try {
const model = buildModel();
const assistantWithText: AssistantMessage = {
role: "assistant",
api: "openai-responses",
provider: "openai",
model: "gpt-5.2",
usage: {
input: 0,
output: 0,
cacheRead: 0,
cacheWrite: 0,
totalTokens: 0,
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
},
stopReason: "stop",
timestamp: Date.now(),
content: [
{
type: "thinking",
thinking: "internal",
thinkingSignature: JSON.stringify({
type: "reasoning",
id: "rs_test",
summary: [],
}),
},
{ type: "text", text: "hello", textSignature: "msg_test" },
],
};
const stream = streamOpenAIResponses(
model,
{
systemPrompt: "system",
messages: [
{ role: "user", content: "Hi", timestamp: Date.now() },
assistantWithText,
{ role: "user", content: "Ok", timestamp: Date.now() },
],
},
{ apiKey: "test" },
);
await stream.result();
const body = cap.getLastBody();
const input = Array.isArray(body?.input) ? body?.input : [];
const types = input
.map((item) =>
item && typeof item === "object"
? (item as Record<string, unknown>).type
: undefined,
)
.filter((t): t is string => typeof t === "string");
expect(types).toContain("reasoning");
expect(types).toContain("message");
} finally {
cap.restore();
}
});
});

View File

@@ -860,6 +860,7 @@ export async function compactEmbeddedPiSession(params: {
params.sessionKey ?? params.sessionId,
);
const contextFiles = buildBootstrapContextFiles(bootstrapFiles);
const runAbortController = new AbortController();
const tools = createClawdbotCodingTools({
bash: {
...params.config?.tools?.bash,

View File

@@ -651,7 +651,9 @@ export function createClawdbotCodingTools(options?: {
// Without this, some providers (notably OpenAI) will reject root-level union schemas.
const normalized = subagentFiltered.map(normalizeToolParameters);
const withAbort = options?.abortSignal
? normalized.map((tool) => wrapToolWithAbortSignal(tool, options.abortSignal))
? normalized.map((tool) =>
wrapToolWithAbortSignal(tool, options.abortSignal),
)
: normalized;
// Anthropic blocks specific lowercase tool names (bash, read, write, edit) with OAuth tokens.
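
A note on the normalization comment above: OpenAI function tools expect the parameters root to be an object schema, so a root-level union is the shape that gets rejected. The sketch below only illustrates that shape and one possible way to nest it; it is not the repo's normalizeToolParameters implementation, and the names are hypothetical.

import { Type } from "@sinclair/typebox";

// A root-level union: the kind of tool parameter schema the comment above says
// providers such as OpenAI reject without normalization.
const rootUnion = Type.Union([
  Type.Object({ action: Type.Literal("read"), path: Type.String() }),
  Type.Object({ action: Type.Literal("write"), path: Type.String(), content: Type.String() }),
]);

// One possible (hypothetical) normalization: keep an object root and move the union
// under a property; the actual normalizeToolParameters may do something different.
const normalizedExample = Type.Object({ request: rootUnion });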

View File

@@ -4,6 +4,7 @@ import {
loadSessionStore,
resolveStorePath,
saveSessionStore,
type SessionEntry,
} from "../../config/sessions.js";
import {
parseAgentSessionKey,
@@ -35,7 +36,7 @@ export function setAbortMemory(key: string, value: boolean): void {
}
function resolveSessionEntryForKey(
store: Record<string, { sessionId: string; updatedAt: number }> | undefined,
store: Record<string, SessionEntry> | undefined,
sessionKey: string | undefined,
) {
if (!store || !sessionKey) return {};

View File

@@ -7,7 +7,10 @@ import type { ReplyDispatcher } from "./reply-dispatcher.js";
const mocks = vi.hoisted(() => ({
routeReply: vi.fn(async () => ({ ok: true, messageId: "mock" })),
tryFastAbortFromMessage: vi.fn(async () => ({ handled: false, aborted: false })),
tryFastAbortFromMessage: vi.fn(async () => ({
handled: false,
aborted: false,
})),
}));
vi.mock("./route-reply.js", () => ({