fix(openai): avoid invalid reasoning replay
This commit is contained in:
196
src/agents/models.profiles.live.test.ts
Normal file
196
src/agents/models.profiles.live.test.ts
Normal file
@@ -0,0 +1,196 @@
|
||||
import { type Api, completeSimple, type Model } from "@mariozechner/pi-ai";
|
||||
import {
|
||||
discoverAuthStorage,
|
||||
discoverModels,
|
||||
} from "@mariozechner/pi-coding-agent";
|
||||
import { Type } from "@sinclair/typebox";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { loadConfig } from "../config/config.js";
|
||||
import { resolveClawdbotAgentDir } from "./agent-paths.js";
|
||||
import { getApiKeyForModel } from "./model-auth.js";
|
||||
import { ensureClawdbotModelsJson } from "./models-config.js";
|
||||
|
||||
// Opt-in switch for the live suite: either LIVE=1 or CLAWDBOT_LIVE_TEST=1.
const LIVE = process.env.LIVE === "1" || process.env.CLAWDBOT_LIVE_TEST === "1";

// Second opt-in for the all-models sweep: CLAWDBOT_LIVE_ALL_MODELS=1 or
// CLAWDBOT_LIVE_MODELS=all.
const ALL_MODELS =
  process.env.CLAWDBOT_LIVE_ALL_MODELS === "1" ||
  process.env.CLAWDBOT_LIVE_MODELS === "all";

// When enabled, models whose credential source does not start with
// "profile:" are skipped instead of exercised.
const REQUIRE_PROFILE_KEYS =
  process.env.CLAWDBOT_LIVE_REQUIRE_PROFILE_KEYS === "1";

// The suite only runs when both opt-ins are set; otherwise it is registered
// as skipped.
const describeLive = LIVE && ALL_MODELS ? describe : describe.skip;
|
||||
|
||||
/**
 * Parses a comma-separated list of model ids into a lookup set.
 *
 * @param raw - Raw filter text, e.g. `"openai/gpt-5.2, other/model"`.
 * @returns The set of trimmed, non-empty ids, or `null` when no filtering
 *   applies: the input is missing or blank, is exactly `"all"`, or contains
 *   only separators.
 */
function parseModelFilter(raw?: string): Set<string> | null {
  const trimmed = raw?.trim();
  if (!trimmed || trimmed === "all") return null;

  const ids = trimmed
    .split(",")
    .map((s) => s.trim())
    .filter(Boolean);

  // A list made solely of commas/whitespace is treated as "no filter".
  return ids.length ? new Set(ids) : null;
}
|
||||
|
||||
// Live sweep: sends one short completion to every discovered model (optionally
// narrowed by CLAWDBOT_LIVE_MODELS) and reports all failures together.
describeLive("live models (profile keys)", () => {
  it(
    "completes across configured models",
    async () => {
      const cfg = loadConfig();
      await ensureClawdbotModelsJson(cfg);

      const agentDir = resolveClawdbotAgentDir();
      const authStorage = discoverAuthStorage(agentDir);
      const modelRegistry = discoverModels(authStorage, agentDir);
      const models = modelRegistry.getAll() as Array<Model<Api>>;

      const filter = parseModelFilter(process.env.CLAWDBOT_LIVE_MODELS);

      const failures: Array<{ model: string; error: string }> = [];
      const skipped: Array<{ model: string; reason: string }> = [];
      // Number of models that actually reached a completion call.
      let attempted = 0;

      for (const model of models) {
        const id = `${model.provider}/${model.id}`;
        if (filter && !filter.has(id)) continue;

        let apiKeyInfo: Awaited<ReturnType<typeof getApiKeyForModel>>;
        try {
          apiKeyInfo = await getApiKeyForModel({ model, cfg });
        } catch (err) {
          // No usable credential: record it as skipped, not as a failure.
          skipped.push({ model: id, reason: String(err) });
          continue;
        }

        if (REQUIRE_PROFILE_KEYS && !apiKeyInfo.source.startsWith("profile:")) {
          skipped.push({
            model: id,
            reason: `non-profile credential source: ${apiKeyInfo.source}`,
          });
          continue;
        }

        attempted += 1;

        try {
          // Special regression: OpenAI rejects replayed `reasoning` items for tool-only turns.
          if (
            model.provider === "openai" &&
            model.api === "openai-responses" &&
            model.id === "gpt-5.2"
          ) {
            const noopTool = {
              name: "noop",
              description: "Return ok.",
              parameters: Type.Object({}, { additionalProperties: false }),
            };
            const toolPrompt =
              "Call the tool `noop` with {}. Do not write any other text.";

            // Turn 1: provoke a tool-call-only assistant turn.
            const first = await completeSimple(
              model,
              {
                messages: [
                  {
                    role: "user",
                    content: toolPrompt,
                    timestamp: Date.now(),
                  },
                ],
                tools: [noopTool],
              },
              {
                apiKey: apiKeyInfo.apiKey,
                reasoning: model.reasoning ? "low" : undefined,
                maxTokens: 128,
                temperature: 0,
              },
            );

            const toolCall = first.content.find((b) => b.type === "toolCall");
            expect(toolCall).toBeTruthy();
            if (!toolCall || toolCall.type !== "toolCall") {
              throw new Error("expected tool call");
            }

            // Turn 2: replay the first turn plus its tool result; the request
            // must still be accepted and produce text.
            const second = await completeSimple(
              model,
              {
                messages: [
                  {
                    role: "user",
                    content: toolPrompt,
                    timestamp: Date.now(),
                  },
                  first,
                  {
                    role: "toolResult",
                    toolCallId: toolCall.id,
                    toolName: "noop",
                    content: [{ type: "text", text: "ok" }],
                    isError: false,
                    timestamp: Date.now(),
                  },
                  {
                    role: "user",
                    content: "Reply with the word ok.",
                    timestamp: Date.now(),
                  },
                ],
              },
              {
                apiKey: apiKeyInfo.apiKey,
                reasoning: model.reasoning ? "low" : undefined,
                maxTokens: 64,
                temperature: 0,
              },
            );

            const secondText = second.content
              .filter((b) => b.type === "text")
              .map((b) => b.text.trim())
              .join(" ");
            expect(secondText.length).toBeGreaterThan(0);
            continue;
          }

          const res = await completeSimple(
            model,
            {
              messages: [
                {
                  role: "user",
                  content: "Reply with the word ok.",
                  timestamp: Date.now(),
                },
              ],
            },
            {
              apiKey: apiKeyInfo.apiKey,
              reasoning: model.reasoning ? "low" : undefined,
              maxTokens: 64,
              temperature: 0,
            },
          );

          const text = res.content
            .filter((block) => block.type === "text")
            .map((block) => block.text.trim())
            .join(" ");
          expect(text.length).toBeGreaterThan(0);
        } catch (err) {
          // Collect instead of aborting so one bad model does not hide the rest.
          failures.push({ model: id, error: String(err) });
        }
      }

      if (failures.length > 0) {
        const preview = failures
          .slice(0, 10)
          .map((f) => `- ${f.model}: ${f.error}`)
          .join("\n");
        throw new Error(
          `live model failures (${failures.length}):\n${preview}`,
        );
      }

      // Fail loudly if we somehow ran nothing: either no models were
      // discovered, or every one of them was filtered out / skipped.
      expect(models.length).toBeGreaterThan(0);
      if (attempted === 0) {
        const skippedPreview = skipped
          .slice(0, 10)
          .map((s) => `- ${s.model}: ${s.reason}`)
          .join("\n");
        throw new Error(
          `no live models were exercised (${models.length} discovered, ${skipped.length} skipped):\n${skippedPreview}`,
        );
      }
    },
    15 * 60 * 1000,
  );
});
|
||||
216
src/agents/openai-responses.reasoning-replay.test.ts
Normal file
216
src/agents/openai-responses.reasoning-replay.test.ts
Normal file
@@ -0,0 +1,216 @@
|
||||
import type {
|
||||
AssistantMessage,
|
||||
Model,
|
||||
ToolResultMessage,
|
||||
} from "@mariozechner/pi-ai";
|
||||
import { streamOpenAIResponses } from "@mariozechner/pi-ai";
|
||||
import { Type } from "@sinclair/typebox";
|
||||
import { describe, expect, it } from "vitest";
|
||||
|
||||
/**
 * Builds the model descriptor used by these tests: an `openai-responses`
 * model with id "gpt-5.2" and `reasoning` enabled. Costs are zeroed.
 *
 * @returns A fresh descriptor object on every call.
 */
function buildModel(): Model<"openai-responses"> {
  return {
    id: "gpt-5.2",
    name: "gpt-5.2",
    api: "openai-responses",
    provider: "openai",
    baseUrl: "https://api.openai.com/v1",
    reasoning: true,
    input: ["text"],
    cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
    contextWindow: 128_000,
    maxTokens: 4096,
  };
}
|
||||
|
||||
/**
 * Replaces `globalThis.fetch` with a stub that records the JSON-decoded
 * request body and then always rejects. The stub never calls the original
 * fetch, so tests can inspect the outgoing payload without a real request.
 *
 * @returns `getLastBody()` — the most recently captured body (undefined when
 *   nothing was captured or the request had no body), and `restore()` — puts
 *   the original `fetch` back. Callers should invoke `restore()` in `finally`.
 */
function installFailingFetchCapture() {
  const originalFetch = globalThis.fetch;
  let lastBody: unknown;

  // Best-effort conversion of a request body to text.
  const decodeBody = (rawBody: unknown): string => {
    if (!rawBody) return "";
    if (typeof rawBody === "string") return rawBody;
    // Covers Uint8Array/Buffer as well as other typed arrays and DataView.
    if (ArrayBuffer.isView(rawBody))
      return Buffer.from(
        rawBody.buffer,
        rawBody.byteOffset,
        rawBody.byteLength,
      ).toString("utf8");
    if (rawBody instanceof ArrayBuffer)
      return Buffer.from(new Uint8Array(rawBody)).toString("utf8");
    return String(rawBody);
  };

  const fetchImpl: typeof fetch = async (_input, init) => {
    const bodyText = decodeBody(init?.body);
    lastBody = bodyText ? (JSON.parse(bodyText) as unknown) : undefined;
    // Abort after capturing: the tests only care about the request payload.
    throw new Error("intentional fetch abort (test)");
  };

  globalThis.fetch = fetchImpl;

  return {
    getLastBody: () => lastBody as Record<string, unknown> | undefined,
    restore: () => {
      globalThis.fetch = originalFetch;
    },
  };
}
|
||||
|
||||
// Verifies which item types end up in the `input` array of the request body
// that `streamOpenAIResponses` sends when a prior assistant turn is replayed.
describe("openai-responses reasoning replay", () => {
  /** Builds a synthetic prior assistant turn with zeroed usage. */
  function buildAssistant(
    stopReason: AssistantMessage["stopReason"],
    content: AssistantMessage["content"],
  ): AssistantMessage {
    return {
      role: "assistant",
      api: "openai-responses",
      provider: "openai",
      model: "gpt-5.2",
      usage: {
        input: 0,
        output: 0,
        cacheRead: 0,
        cacheWrite: 0,
        totalTokens: 0,
        cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
      },
      stopReason,
      timestamp: Date.now(),
      content,
    };
  }

  /** Collects the string `type` of every object item in `body.input`. */
  function inputItemTypes(body: Record<string, unknown> | undefined): string[] {
    const rawInput = body?.input;
    const input: unknown[] = Array.isArray(rawInput) ? rawInput : [];
    return input
      .map((item) =>
        item && typeof item === "object"
          ? (item as Record<string, unknown>).type
          : undefined,
      )
      .filter((t): t is string => typeof t === "string");
  }

  it("does not replay standalone reasoning for tool-call-only turns", async () => {
    const cap = installFailingFetchCapture();
    try {
      const model = buildModel();

      // Prior turn containing only reasoning + a tool call (no text).
      const assistantToolOnly = buildAssistant("toolUse", [
        {
          type: "thinking",
          thinking: "internal",
          thinkingSignature: JSON.stringify({
            type: "reasoning",
            id: "rs_test",
            summary: [],
          }),
        },
        {
          type: "toolCall",
          id: "call_123|fc_123",
          name: "noop",
          arguments: {},
        },
      ]);

      const toolResult: ToolResultMessage = {
        role: "toolResult",
        toolCallId: "call_123|fc_123",
        toolName: "noop",
        content: [{ type: "text", text: "ok" }],
        isError: false,
        timestamp: Date.now(),
      };

      const stream = streamOpenAIResponses(
        model,
        {
          systemPrompt: "system",
          messages: [
            {
              role: "user",
              content: "Call noop.",
              timestamp: Date.now(),
            },
            assistantToolOnly,
            toolResult,
            {
              role: "user",
              content: "Now reply with ok.",
              timestamp: Date.now(),
            },
          ],
          tools: [
            {
              name: "noop",
              description: "no-op",
              parameters: Type.Object({}, { additionalProperties: false }),
            },
          ],
        },
        { apiKey: "test" },
      );

      // The stubbed fetch rejects; awaiting lets the request be issued and
      // captured before the body is inspected.
      await stream.result();

      const types = inputItemTypes(cap.getLastBody());

      expect(types).toContain("function_call");
      expect(types).not.toContain("reasoning");
    } finally {
      cap.restore();
    }
  });

  it("still replays reasoning when paired with an assistant message", async () => {
    const cap = installFailingFetchCapture();
    try {
      const model = buildModel();

      // Prior turn containing reasoning followed by assistant text.
      const assistantWithText = buildAssistant("stop", [
        {
          type: "thinking",
          thinking: "internal",
          thinkingSignature: JSON.stringify({
            type: "reasoning",
            id: "rs_test",
            summary: [],
          }),
        },
        { type: "text", text: "hello", textSignature: "msg_test" },
      ]);

      const stream = streamOpenAIResponses(
        model,
        {
          systemPrompt: "system",
          messages: [
            { role: "user", content: "Hi", timestamp: Date.now() },
            assistantWithText,
            { role: "user", content: "Ok", timestamp: Date.now() },
          ],
        },
        { apiKey: "test" },
      );

      await stream.result();

      const types = inputItemTypes(cap.getLastBody());

      expect(types).toContain("reasoning");
      expect(types).toContain("message");
    } finally {
      cap.restore();
    }
  });
});
|
||||
@@ -860,6 +860,7 @@ export async function compactEmbeddedPiSession(params: {
|
||||
params.sessionKey ?? params.sessionId,
|
||||
);
|
||||
const contextFiles = buildBootstrapContextFiles(bootstrapFiles);
|
||||
const runAbortController = new AbortController();
|
||||
const tools = createClawdbotCodingTools({
|
||||
bash: {
|
||||
...params.config?.tools?.bash,
|
||||
|
||||
@@ -651,7 +651,9 @@ export function createClawdbotCodingTools(options?: {
|
||||
// Without this, some providers (notably OpenAI) will reject root-level union schemas.
|
||||
const normalized = subagentFiltered.map(normalizeToolParameters);
|
||||
const withAbort = options?.abortSignal
|
||||
? normalized.map((tool) => wrapToolWithAbortSignal(tool, options.abortSignal))
|
||||
? normalized.map((tool) =>
|
||||
wrapToolWithAbortSignal(tool, options.abortSignal),
|
||||
)
|
||||
: normalized;
|
||||
|
||||
// Anthropic blocks specific lowercase tool names (bash, read, write, edit) with OAuth tokens.
|
||||
|
||||
@@ -4,6 +4,7 @@ import {
|
||||
loadSessionStore,
|
||||
resolveStorePath,
|
||||
saveSessionStore,
|
||||
type SessionEntry,
|
||||
} from "../../config/sessions.js";
|
||||
import {
|
||||
parseAgentSessionKey,
|
||||
@@ -35,7 +36,7 @@ export function setAbortMemory(key: string, value: boolean): void {
|
||||
}
|
||||
|
||||
function resolveSessionEntryForKey(
|
||||
store: Record<string, { sessionId: string; updatedAt: number }> | undefined,
|
||||
store: Record<string, SessionEntry> | undefined,
|
||||
sessionKey: string | undefined,
|
||||
) {
|
||||
if (!store || !sessionKey) return {};
|
||||
|
||||
@@ -7,7 +7,10 @@ import type { ReplyDispatcher } from "./reply-dispatcher.js";
|
||||
|
||||
const mocks = vi.hoisted(() => ({
|
||||
routeReply: vi.fn(async () => ({ ok: true, messageId: "mock" })),
|
||||
tryFastAbortFromMessage: vi.fn(async () => ({ handled: false, aborted: false })),
|
||||
tryFastAbortFromMessage: vi.fn(async () => ({
|
||||
handled: false,
|
||||
aborted: false,
|
||||
})),
|
||||
}));
|
||||
|
||||
vi.mock("./route-reply.js", () => ({
|
||||
|
||||
Reference in New Issue
Block a user