From 435edaf99734a1530a532f5f8c4556f952b419d6 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Mon, 5 Jan 2026 00:15:42 +0100 Subject: [PATCH] fix: OpenAI tool schema compatibility --- CHANGELOG.md | 1 + src/agents/pi-embedded-runner.ts | 2 + src/agents/pi-tools.test.ts | 92 +++++++++++++--------- src/agents/pi-tools.ts | 45 ++++++++++- src/agents/tools/browser-tool.ts | 131 +++++++++---------------------- 5 files changed, 138 insertions(+), 133 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 299d1bb6b..43c55a35f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ - macOS: treat location permission as always-only to avoid iOS-only enums. (#165) — thanks @Nachx639 - macOS: make generated gateway protocol models `Sendable` for Swift 6 strict concurrency. (#195) — thanks @andranik-sahakyan - WhatsApp: suppress typing indicator during heartbeat background tasks. (#190) — thanks @mcinteerj +- Agent tools: OpenAI-compatible tool JSON Schemas (fix `browser`, normalize union schemas). - Onboarding: when running from source, auto-build missing Control UI assets (`pnpm ui:build`). - Discord/Slack: route reaction + system notifications to the correct session (no main-session bleed). - Agent tools: honor `agent.tools` allow/deny policy even when sandbox is off. diff --git a/src/agents/pi-embedded-runner.ts b/src/agents/pi-embedded-runner.ts index 9184cbb86..4472c8dc1 100644 --- a/src/agents/pi-embedded-runner.ts +++ b/src/agents/pi-embedded-runner.ts @@ -364,6 +364,8 @@ export async function runEmbeddedPiAgent(params: { await loadWorkspaceBootstrapFiles(resolvedWorkspace); const contextFiles = buildBootstrapContextFiles(bootstrapFiles); const promptSkills = resolvePromptSkills(skillsSnapshot, skillEntries); + // Tool schemas must be provider-compatible (OpenAI requires top-level `type: "object"`). + // `createClawdbotCodingTools()` normalizes schemas so the session can pass them through unchanged. const tools = createClawdbotCodingTools({ bash: { ...params.config?.agent?.bash, diff --git a/src/agents/pi-tools.test.ts b/src/agents/pi-tools.test.ts index d38c5bf3f..a9cb42d44 100644 --- a/src/agents/pi-tools.test.ts +++ b/src/agents/pi-tools.test.ts @@ -5,8 +5,16 @@ import path from "node:path"; import sharp from "sharp"; import { describe, expect, it } from "vitest"; import { createClawdbotCodingTools } from "./pi-tools.js"; +import { createBrowserTool } from "./tools/browser-tool.js"; describe("createClawdbotCodingTools", () => { + it("keeps browser tool schema OpenAI-compatible without normalization", () => { + const browser = createBrowserTool(); + const schema = browser.parameters as { type?: unknown; anyOf?: unknown }; + expect(schema.type).toBe("object"); + expect(schema.anyOf).toBeUndefined(); + }); + it("merges properties for union tool schemas", () => { const tools = createClawdbotCodingTools(); const browser = tools.find((tool) => tool.name === "browser"); @@ -23,54 +31,47 @@ describe("createClawdbotCodingTools", () => { expect(parameters.required ?? []).toContain("action"); }); - it("preserves union action values in merged schema", () => { + it("preserves action enums in normalized schemas", () => { const tools = createClawdbotCodingTools(); const toolNames = ["browser", "canvas", "nodes", "cron", "gateway"]; + const collectActionValues = ( + schema: unknown, + values: Set, + ): void => { + if (!schema || typeof schema !== "object") return; + const record = schema as Record; + if (typeof record.const === "string") values.add(record.const); + if (Array.isArray(record.enum)) { + for (const value of record.enum) { + if (typeof value === "string") values.add(value); + } + } + if (Array.isArray(record.anyOf)) { + for (const variant of record.anyOf) { + collectActionValues(variant, values); + } + } + }; + for (const name of toolNames) { const tool = tools.find((candidate) => candidate.name === name); expect(tool).toBeDefined(); const parameters = tool?.parameters as { - anyOf?: Array<{ properties?: Record }>; properties?: Record; }; - if (!Array.isArray(parameters.anyOf) || parameters.anyOf.length === 0) { - continue; - } - const actionValues = new Set(); - for (const variant of parameters.anyOf ?? []) { - const action = variant?.properties?.action as - | { const?: unknown; enum?: unknown[] } - | undefined; - if (typeof action?.const === "string") actionValues.add(action.const); - if (Array.isArray(action?.enum)) { - for (const value of action.enum) { - if (typeof value === "string") actionValues.add(value); - } - } - } - - if (actionValues.size <= 1) { - continue; - } - const mergedAction = parameters.properties?.action as + const action = parameters.properties?.action as | { const?: unknown; enum?: unknown[] } | undefined; - const mergedValues = new Set(); - if (typeof mergedAction?.const === "string") { - mergedValues.add(mergedAction.const); - } - if (Array.isArray(mergedAction?.enum)) { - for (const value of mergedAction.enum) { - if (typeof value === "string") mergedValues.add(value); - } - } + const values = new Set(); + collectActionValues(action, values); - expect(actionValues.size).toBeGreaterThan(1); - expect(mergedValues.size).toBe(actionValues.size); - for (const value of actionValues) { - expect(mergedValues.has(value)).toBe(true); - } + const min = + name === "gateway" + ? 1 + : // Most tools expose multiple actions; keep this signal so schemas stay useful to models. + 2; + expect(values.size).toBeGreaterThanOrEqual(min); } }); @@ -80,6 +81,25 @@ describe("createClawdbotCodingTools", () => { expect(tools.some((tool) => tool.name === "process")).toBe(true); }); + it("provides top-level object schemas for all tools", () => { + const tools = createClawdbotCodingTools(); + const offenders = tools + .map((tool) => { + const schema = + tool.parameters && typeof tool.parameters === "object" + ? (tool.parameters as Record) + : null; + return { + name: tool.name, + type: schema?.type, + keys: schema ? Object.keys(schema).sort() : null, + }; + }) + .filter((entry) => entry.type !== "object"); + + expect(offenders).toEqual([]); + }); + it("scopes discord tool to discord surface", () => { const other = createClawdbotCodingTools({ surface: "whatsapp" }); expect(other.some((tool) => tool.name === "discord")).toBe(false); diff --git a/src/agents/pi-tools.ts b/src/agents/pi-tools.ts index 646e53195..0123fcb59 100644 --- a/src/agents/pi-tools.ts +++ b/src/agents/pi-tools.ts @@ -218,6 +218,13 @@ function normalizeToolParameters(tool: AnyAgentTool): AnyAgentTool { : undefined; if (!schema) return tool; + // Provider quirks: + // - Gemini rejects several JSON Schema keywords, so we scrub those. + // - OpenAI rejects function tool schemas unless the *top-level* is `type: "object"`. + // (TypeBox root unions compile to `{ anyOf: [...] }` without `type`). + // + // Normalize once here so callers can always pass `tools` through unchanged. + // If schema already has type + properties (no top-level anyOf to merge), // still clean it for Gemini compatibility if ( @@ -231,12 +238,33 @@ function normalizeToolParameters(tool: AnyAgentTool): AnyAgentTool { }; } - if (!Array.isArray(schema.anyOf)) return tool; + // Some tool schemas (esp. unions) may omit `type` at the top-level. If we see + // object-ish fields, force `type: "object"` so OpenAI accepts the schema. + if ( + !("type" in schema) && + (typeof schema.properties === "object" || + Array.isArray(schema.required)) && + !Array.isArray(schema.anyOf) && + !Array.isArray(schema.oneOf) + ) { + return { + ...tool, + parameters: cleanSchemaForGemini({ ...schema, type: "object" }), + }; + } + + const variantKey = Array.isArray(schema.anyOf) + ? "anyOf" + : Array.isArray(schema.oneOf) + ? "oneOf" + : null; + if (!variantKey) return tool; + const variants = schema[variantKey] as unknown[]; const mergedProperties: Record = {}; const requiredCounts = new Map(); let objectVariants = 0; - for (const entry of schema.anyOf) { + for (const entry of variants) { if (!entry || typeof entry !== "object") continue; const props = (entry as { properties?: unknown }).properties; if (!props || typeof props !== "object") continue; @@ -277,9 +305,16 @@ function normalizeToolParameters(tool: AnyAgentTool): AnyAgentTool { const nextSchema: Record = { ...schema }; return { ...tool, + // Flatten union schemas into a single object schema: + // - Gemini doesn't allow top-level `type` together with `anyOf`. + // - OpenAI rejects schemas without top-level `type: "object"`. + // Merging properties preserves useful enums like `action` while keeping schemas portable. parameters: cleanSchemaForGemini({ - ...nextSchema, - type: nextSchema.type ?? "object", + type: "object", + ...(typeof nextSchema.title === "string" ? { title: nextSchema.title } : {}), + ...(typeof nextSchema.description === "string" + ? { description: nextSchema.description } + : {}), properties: Object.keys(mergedProperties).length > 0 ? mergedProperties @@ -518,5 +553,7 @@ export function createClawdbotCodingTools(options?: { const sandboxed = sandbox ? filterToolsByPolicy(globallyFiltered, sandbox.tools) : globallyFiltered; + // Always normalize tool JSON Schemas before handing them to pi-agent/pi-ai. + // Without this, some providers (notably OpenAI) will reject root-level union schemas. return sandboxed.map(normalizeToolParameters); } diff --git a/src/agents/tools/browser-tool.ts b/src/agents/tools/browser-tool.ts index c5fd871a2..882c65a33 100644 --- a/src/agents/tools/browser-tool.ts +++ b/src/agents/tools/browser-tool.ts @@ -97,99 +97,44 @@ const BrowserActSchema = Type.Union([ }), ]); -const BrowserToolSchema = Type.Union([ - Type.Object({ - action: Type.Literal("status"), - controlUrl: Type.Optional(Type.String()), - }), - Type.Object({ - action: Type.Literal("start"), - controlUrl: Type.Optional(Type.String()), - }), - Type.Object({ - action: Type.Literal("stop"), - controlUrl: Type.Optional(Type.String()), - }), - Type.Object({ - action: Type.Literal("tabs"), - controlUrl: Type.Optional(Type.String()), - }), - Type.Object({ - action: Type.Literal("open"), - controlUrl: Type.Optional(Type.String()), - targetUrl: Type.String(), - }), - Type.Object({ - action: Type.Literal("focus"), - controlUrl: Type.Optional(Type.String()), - targetId: Type.String(), - }), - Type.Object({ - action: Type.Literal("close"), - controlUrl: Type.Optional(Type.String()), - targetId: Type.Optional(Type.String()), - }), - Type.Object({ - action: Type.Literal("snapshot"), - controlUrl: Type.Optional(Type.String()), - format: Type.Optional( - Type.Union([Type.Literal("aria"), Type.Literal("ai")]), - ), - targetId: Type.Optional(Type.String()), - limit: Type.Optional(Type.Number()), - }), - Type.Object({ - action: Type.Literal("screenshot"), - controlUrl: Type.Optional(Type.String()), - targetId: Type.Optional(Type.String()), - fullPage: Type.Optional(Type.Boolean()), - ref: Type.Optional(Type.String()), - element: Type.Optional(Type.String()), - type: Type.Optional( - Type.Union([Type.Literal("png"), Type.Literal("jpeg")]), - ), - }), - Type.Object({ - action: Type.Literal("navigate"), - controlUrl: Type.Optional(Type.String()), - targetUrl: Type.String(), - targetId: Type.Optional(Type.String()), - }), - Type.Object({ - action: Type.Literal("console"), - controlUrl: Type.Optional(Type.String()), - level: Type.Optional(Type.String()), - targetId: Type.Optional(Type.String()), - }), - Type.Object({ - action: Type.Literal("pdf"), - controlUrl: Type.Optional(Type.String()), - targetId: Type.Optional(Type.String()), - }), - Type.Object({ - action: Type.Literal("upload"), - controlUrl: Type.Optional(Type.String()), - paths: Type.Array(Type.String()), - ref: Type.Optional(Type.String()), - inputRef: Type.Optional(Type.String()), - element: Type.Optional(Type.String()), - targetId: Type.Optional(Type.String()), - timeoutMs: Type.Optional(Type.Number()), - }), - Type.Object({ - action: Type.Literal("dialog"), - controlUrl: Type.Optional(Type.String()), - accept: Type.Boolean(), - promptText: Type.Optional(Type.String()), - targetId: Type.Optional(Type.String()), - timeoutMs: Type.Optional(Type.Number()), - }), - Type.Object({ - action: Type.Literal("act"), - controlUrl: Type.Optional(Type.String()), - request: BrowserActSchema, - }), -]); +// IMPORTANT: OpenAI function tool schemas must have a top-level `type: "object"`. +// A root-level `Type.Union([...])` compiles to `{ anyOf: [...] }` (no `type`), +// which OpenAI rejects ("Invalid schema ... type: None"). Keep this schema an object. +const BrowserToolSchema = Type.Object({ + action: Type.Union([ + Type.Literal("status"), + Type.Literal("start"), + Type.Literal("stop"), + Type.Literal("tabs"), + Type.Literal("open"), + Type.Literal("focus"), + Type.Literal("close"), + Type.Literal("snapshot"), + Type.Literal("screenshot"), + Type.Literal("navigate"), + Type.Literal("console"), + Type.Literal("pdf"), + Type.Literal("upload"), + Type.Literal("dialog"), + Type.Literal("act"), + ]), + controlUrl: Type.Optional(Type.String()), + targetUrl: Type.Optional(Type.String()), + targetId: Type.Optional(Type.String()), + limit: Type.Optional(Type.Number()), + format: Type.Optional(Type.Union([Type.Literal("aria"), Type.Literal("ai")])), + fullPage: Type.Optional(Type.Boolean()), + ref: Type.Optional(Type.String()), + element: Type.Optional(Type.String()), + type: Type.Optional(Type.Union([Type.Literal("png"), Type.Literal("jpeg")])), + level: Type.Optional(Type.String()), + paths: Type.Optional(Type.Array(Type.String())), + inputRef: Type.Optional(Type.String()), + timeoutMs: Type.Optional(Type.Number()), + accept: Type.Optional(Type.Boolean()), + promptText: Type.Optional(Type.String()), + request: Type.Optional(BrowserActSchema), +}); function resolveBrowserBaseUrl(controlUrl?: string) { const cfg = loadConfig();