feat(gateway): implement OpenResponses /v1/responses endpoint phase 2

- Add input_image and input_file support with SSRF protection
- Add client-side tools (Hosted Tools) support
- Add turn-based tool flow with function_call_output handling (request flow sketched below)
- Export buildAgentPrompt for testing
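A sketch of the two request bodies this enables, as TypeScript object literals (the model id, URLs, the message item's role field, and the exact function_call_output field names are illustrative assumptions; the content-part and tool shapes follow the schemas added below):

// Turn 1: user message with an input_image part plus one client-side function tool.
const firstTurn = {
  model: "my-agent", // illustrative model id
  input: [
    {
      type: "message",
      role: "user",
      content: [
        { type: "input_text", text: "What is in this image?" },
        { type: "input_image", source: { type: "url", url: "https://example.com/cat.png" } },
      ],
    },
  ],
  tools: [
    {
      type: "function",
      function: {
        name: "get_weather",
        description: "Look up the current weather",
        parameters: { type: "object", properties: { city: { type: "string" } } },
      },
    },
  ],
};

// Turn 2: after the gateway returns a function_call output item, the client
// echoes the tool result back; these field names are assumed, not shown in this diff.
const secondTurn = {
  model: "my-agent",
  input: [
    { type: "function_call_output", call_id: "call_abc123", output: '{"temp_c":21}' },
  ],
};
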
Ryan Lisse
2026-01-19 12:43:00 +01:00
committed by Peter Steinberger
parent f4b03599f0
commit a5afe7bc2b
12 changed files with 437 additions and 28 deletions

View File

@@ -90,7 +90,7 @@ export class GatewayClient {
};
if (url.startsWith("wss://") && this.opts.tlsFingerprint) {
wsOptions.rejectUnauthorized = false;
wsOptions.checkServerIdentity = (_host: string, cert: CertMeta) => {
wsOptions.checkServerIdentity = ((_host: string, cert: CertMeta) => {
const fingerprintValue =
typeof cert === "object" && cert && "fingerprint256" in cert
? ((cert as { fingerprint256?: string }).fingerprint256 ?? "")
@@ -99,9 +99,17 @@ export class GatewayClient {
typeof fingerprintValue === "string" ? fingerprintValue : "",
);
const expected = normalizeFingerprint(this.opts.tlsFingerprint ?? "");
if (!expected || !fingerprint) return false;
return fingerprint === expected;
};
if (!expected) {
return new Error("gateway tls fingerprint missing");
}
if (!fingerprint) {
return new Error("gateway tls fingerprint unavailable");
}
if (fingerprint !== expected) {
return new Error("gateway tls fingerprint mismatch");
}
return undefined;
}) as any;
}
this.ws = new WebSocket(url, wsOptions);
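Context for this change: Node's checkServerIdentity hook is documented to return an Error on failure and undefined on success, which is what the rewritten callback does, and the distinct Error messages make pinning failures easier to diagnose. normalizeFingerprint is referenced but not shown in this diff; a plausible sketch, assuming it only strips separators and normalizes case so "ab:cd:..." and "ABCD..." compare equal:

// Assumption: not part of this diff; shown only to make the comparison above concrete.
function normalizeFingerprint(value: string): string {
  return value.replace(/[^0-9a-fA-F]/g, "").toUpperCase();
}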

View File

@@ -27,18 +27,46 @@ export const OutputTextContentPartSchema = z
})
.strict();
// For Phase 1, we reject image/file content with helpful errors
// OpenResponses Image Content: Supports URL or base64 sources
export const InputImageSourceSchema = z.discriminatedUnion("type", [
z.object({
type: z.literal("url"),
url: z.string().url(),
}),
z.object({
type: z.literal("base64"),
media_type: z.enum(["image/jpeg", "image/png", "image/gif", "image/webp"]),
data: z.string().min(1), // base64-encoded
}),
]);
export const InputImageContentPartSchema = z
.object({
type: z.literal("input_image"),
source: InputImageSourceSchema,
})
.passthrough();
.strict();
// OpenResponses File Content: Supports URL or base64 sources
export const InputFileSourceSchema = z.discriminatedUnion("type", [
z.object({
type: z.literal("url"),
url: z.string().url(),
}),
z.object({
type: z.literal("base64"),
media_type: z.string().min(1), // MIME type
data: z.string().min(1), // base64-encoded
filename: z.string().optional(),
}),
]);
export const InputFileContentPartSchema = z
.object({
type: z.literal("input_file"),
source: InputFileSourceSchema,
})
.passthrough();
.strict();
export const ContentPartSchema = z.discriminatedUnion("type", [
InputTextContentPartSchema,
@@ -117,13 +145,14 @@ export const FunctionToolDefinitionSchema = z
.object({
type: z.literal("function"),
function: z.object({
name: z.string(),
name: z.string().min(1, "Tool name cannot be empty"),
description: z.string().optional(),
parameters: z.record(z.string(), z.unknown()).optional(),
}),
})
.strict();
// OpenResponses tool definitions match internal ToolDefinition structure
export const ToolDefinitionSchema = FunctionToolDefinitionSchema;
export type ToolDefinition = z.infer<typeof ToolDefinitionSchema>;
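A quick usage sketch for the new schemas (zod's safeParse; values are illustrative, and the failing case exercises the new min(1) name check):

const img = InputImageContentPartSchema.safeParse({
  type: "input_image",
  source: { type: "base64", media_type: "image/png", data: "iVBORw0KGgo=" },
});
// img.success === true; with .strict(), unknown extra keys are now rejected

const bad = ToolDefinitionSchema.safeParse({
  type: "function",
  function: { name: "" },
});
// bad.success === false; the issue message is "Tool name cannot be empty"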

View File

@@ -27,6 +27,8 @@ import {
type StreamingEvent,
type Usage,
} from "./open-responses.schema.js";
import type { ClientToolDefinition } from "../agents/pi-embedded-runner/run/params.js";
import type { ImageContent } from "../commands/agent/types.js";
type OpenResponsesHttpOptions = {
auth: ResolvedGatewayAuth;
@@ -74,16 +76,157 @@ function extractTextContent(content: string | ContentPart[]): string {
.join("\n");
}
function hasUnsupportedContent(content: string | ContentPart[]): string | null {
if (typeof content === "string") return null;
for (const part of content) {
if (part.type === "input_image") return "input_image content is not supported in Phase 1";
if (part.type === "input_file") return "input_file content is not supported in Phase 1";
}
return null;
const PRIVATE_IP_PATTERNS = [
/^127\./, // Loopback
/^192\.168\./, // Private network
/^10\./, // Private network
/^172\.(1[6-9]|2[0-9]|3[0-1])\./, // Private network
/^::1$/, // IPv6 loopback
/^fe80:/, // IPv6 link-local
/^fec0:/, // IPv6 site-local
];
function isPrivateIp(hostname: string): boolean {
return PRIVATE_IP_PATTERNS.some((pattern) => pattern.test(hostname));
}
function buildAgentPrompt(input: string | ItemParam[]): {
// Fetch with SSRF protection, timeout, and size limits
async function fetchWithGuard(
url: string,
maxBytes: number,
timeoutMs: number = 10000,
): Promise<{ data: string; mimeType: string }> {
const parsedUrl = new URL(url);
// Only allow HTTP/HTTPS
if (!["http:", "https:"].includes(parsedUrl.protocol)) {
throw new Error(`Invalid URL protocol: ${parsedUrl.protocol}. Only HTTP/HTTPS allowed.`);
}
// Block private IPs (SSRF protection)
if (isPrivateIp(parsedUrl.hostname)) {
throw new Error(`Private IP addresses are not allowed: ${parsedUrl.hostname}`);
}
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
try {
const response = await fetch(url, {
signal: controller.signal,
headers: { "User-Agent": "Clawdbot-Gateway/1.0" },
});
if (!response.ok) {
throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
}
const contentLength = response.headers.get("content-length");
if (contentLength) {
const size = parseInt(contentLength, 10);
if (size > maxBytes) {
throw new Error(`Content too large: ${size} bytes (limit: ${maxBytes} bytes)`);
}
}
const buffer = await response.arrayBuffer();
if (buffer.byteLength > maxBytes) {
throw new Error(`Content too large: ${buffer.byteLength} bytes (limit: ${maxBytes} bytes)`);
}
// Strip any ";charset=..." parameter and lowercase so the MIME allow-list checks match
const rawContentType = response.headers.get("content-type") || "application/octet-stream";
const mimeType = rawContentType.split(";")[0].trim().toLowerCase();
return {
data: Buffer.from(buffer).toString("base64"),
mimeType,
};
} finally {
clearTimeout(timeoutId);
}
}
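A sketch of how the guard behaves (URLs and the byte limit here are illustrative; the protocol and private-IP checks run before any network I/O):

await fetchWithGuard("file:///etc/passwd", 5 * 1024 * 1024);      // throws: invalid protocol
await fetchWithGuard("http://127.0.0.1:8080/x", 5 * 1024 * 1024); // throws: private IP not allowed
const { data, mimeType } = await fetchWithGuard("https://example.com/doc.pdf", 5 * 1024 * 1024);
// data is base64; mimeType comes from the Content-Type header, defaulting to application/octet-stream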
const ALLOWED_IMAGE_MIMES = new Set(["image/jpeg", "image/png", "image/gif", "image/webp"]);
const MAX_IMAGE_BYTES = 10 * 1024 * 1024; // 10MB
const MAX_FILE_BYTES = 5 * 1024 * 1024; // 5MB
const ALLOWED_FILE_MIMES = new Set([
"text/plain",
"text/markdown",
"text/html",
"text/csv",
"application/pdf",
"application/json",
]);
async function extractImageContent(part: ContentPart): Promise<ImageContent | null> {
if (part.type !== "input_image") return null;
const source = part.source as { type: string; url?: string; data?: string; media_type?: string };
if (source.type === "base64") {
if (!source.data) {
throw new Error("input_image base64 source missing 'data' field");
}
const mimeType = source.media_type || "image/png";
if (!ALLOWED_IMAGE_MIMES.has(mimeType)) {
throw new Error(`Unsupported image MIME type: ${mimeType}`);
}
return { type: "image", data: source.data, mimeType };
}
if (source.type === "url" && source.url) {
const result = await fetchWithGuard(source.url, MAX_IMAGE_BYTES);
if (!ALLOWED_IMAGE_MIMES.has(result.mimeType)) {
throw new Error(`Unsupported image MIME type from URL: ${result.mimeType}`);
}
return { type: "image", data: result.data, mimeType: result.mimeType };
}
throw new Error("input_image must have 'source.url' or 'source.data'");
}
async function extractFileContent(part: ContentPart): Promise<string | null> {
if (part.type !== "input_file") return null;
const source = part.source as {
type: string;
url?: string;
data?: string;
media_type?: string;
filename?: string;
};
const filename = source.filename || "file";
let content: string;
if (source.type === "base64") {
if (!source.data) {
throw new Error("input_file base64 source missing 'data' field");
}
const buffer = Buffer.from(source.data, "base64");
if (buffer.byteLength > MAX_FILE_BYTES) {
throw new Error(
`File too large: ${buffer.byteLength} bytes (limit: ${MAX_FILE_BYTES} bytes)`,
);
}
content = buffer.toString("utf-8");
} else if (source.type === "url" && source.url) {
const result = await fetchWithGuard(source.url, MAX_FILE_BYTES);
if (!ALLOWED_FILE_MIMES.has(result.mimeType)) {
throw new Error(`Unsupported file MIME type: ${result.mimeType}`);
}
content = Buffer.from(result.data, "base64").toString("utf-8");
} else {
throw new Error("input_file must have 'source.url' or 'source.data'");
}
return `<file name="${filename}">\n${content}\n</file>`;
}
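For example, a small base64 text file comes back wrapped for inclusion in the prompt (input values are illustrative):

const wrapped = await extractFileContent({
  type: "input_file",
  source: {
    type: "base64",
    media_type: "text/plain",
    data: Buffer.from("hello world").toString("base64"),
    filename: "notes.txt",
  },
} as ContentPart);
// wrapped === '<file name="notes.txt">\nhello world\n</file>'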
function extractClientTools(body: CreateResponseBody): ClientToolDefinition[] {
return (body.tools ?? []) as ClientToolDefinition[];
}
export function buildAgentPrompt(input: string | ItemParam[]): {
message: string;
extraSystemPrompt?: string;
} {
@@ -293,33 +436,44 @@ export async function handleOpenResponsesHttpRequest(
const model = payload.model;
const user = payload.user;
// Check for unsupported content types (Phase 1)
// Extract images, files, and tools from input (Phase 2)
const images: ImageContent[] = [];
const fileContents: string[] = [];
if (Array.isArray(payload.input)) {
for (const item of payload.input) {
if (item.type === "message" && typeof item.content !== "string") {
const unsupported = hasUnsupportedContent(item.content);
if (unsupported) {
sendJson(res, 400, {
error: { message: unsupported, type: "invalid_request_error" },
});
return true;
for (const part of item.content) {
const image = await extractImageContent(part);
if (image) {
images.push(image);
continue;
}
const file = await extractFileContent(part);
if (file) {
fileContents.push(file);
}
}
}
}
}
const clientTools = extractClientTools(payload);
const agentId = resolveAgentIdForRequest({ req, model });
const sessionKey = resolveSessionKey({ req, agentId, user });
// Build prompt from input
const prompt = buildAgentPrompt(payload.input);
// Append file contents to the message
const fullMessage =
fileContents.length > 0 ? `${prompt.message}\n\n${fileContents.join("\n\n")}` : prompt.message;
// Handle instructions as extra system prompt
const extraSystemPrompt = [payload.instructions, prompt.extraSystemPrompt]
.filter(Boolean)
.join("\n\n");
if (!prompt.message) {
if (!fullMessage) {
sendJson(res, 400, {
error: {
message: "Missing user message in `input`.",
@@ -337,7 +491,9 @@ export async function handleOpenResponsesHttpRequest(
try {
const result = await agentCommand(
{
message: prompt.message,
message: fullMessage,
images: images.length > 0 ? images : undefined,
clientTools: clientTools.length > 0 ? clientTools : undefined,
extraSystemPrompt: extraSystemPrompt || undefined,
sessionKey,
runId: responseId,
@@ -350,6 +506,36 @@ export async function handleOpenResponsesHttpRequest(
);
const payloads = (result as { payloads?: Array<{ text?: string }> } | null)?.payloads;
const meta = (result as { meta?: unknown } | null)?.meta;
const stopReason =
meta && typeof meta === "object" ? (meta as { stopReason?: string }).stopReason : undefined;
const pendingToolCalls =
meta && typeof meta === "object"
? (meta as { pendingToolCalls?: Array<{ id: string; name: string; arguments: string }> })
.pendingToolCalls
: undefined;
// If agent called a client tool, return function_call instead of text
if (stopReason === "tool_calls" && pendingToolCalls && pendingToolCalls.length > 0) {
const functionCall = pendingToolCalls[0];
const response = createResponseResource({
id: responseId,
model,
status: "incomplete",
output: [
{
type: "function_call",
id: functionCall.id,
call_id: functionCall.id,
name: functionCall.name,
arguments: functionCall.arguments,
},
],
});
sendJson(res, 200, response);
return true;
}
const content =
Array.isArray(payloads) && payloads.length > 0
? payloads
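The client receives that resource as a JSON body roughly shaped like the sketch below (values illustrative; createResponseResource presumably adds the usual envelope fields, which are not set in this hunk). The expected next step is the follow-up turn with function_call_output sketched in the commit message above.

const exampleToolCallResponse = {
  id: "resp_123",
  model: "my-agent",
  status: "incomplete",
  output: [
    {
      type: "function_call",
      id: "call_8f2e",
      call_id: "call_8f2e",
      name: "get_weather",
      arguments: '{"city":"Berlin"}',
    },
  ],
};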
@@ -511,7 +697,9 @@ export async function handleOpenResponsesHttpRequest(
try {
const result = await agentCommand(
{
message: prompt.message,
message: fullMessage,
images: images.length > 0 ? images : undefined,
clientTools: clientTools.length > 0 ? clientTools : undefined,
extraSystemPrompt: extraSystemPrompt || undefined,
sessionKey,
runId: responseId,
@@ -527,7 +715,90 @@ export async function handleOpenResponsesHttpRequest(
// Fallback: if no streaming deltas were received, send the full response
if (!sawAssistantDelta) {
const payloads = (result as { payloads?: Array<{ text?: string }> } | null)?.payloads;
const resultAny = result as { payloads?: Array<{ text?: string }>; meta?: unknown };
const payloads = resultAny.payloads;
const meta = resultAny.meta;
const stopReason =
meta && typeof meta === "object"
? (meta as { stopReason?: string }).stopReason
: undefined;
const pendingToolCalls =
meta && typeof meta === "object"
? (
meta as {
pendingToolCalls?: Array<{ id: string; name: string; arguments: string }>;
}
).pendingToolCalls
: undefined;
// If agent called a client tool, emit function_call instead of text
if (stopReason === "tool_calls" && pendingToolCalls && pendingToolCalls.length > 0) {
const functionCall = pendingToolCalls[0];
// Complete the text content part
writeSseEvent(res, {
type: "response.output_text.done",
item_id: outputItemId,
output_index: 0,
content_index: 0,
text: "",
});
writeSseEvent(res, {
type: "response.content_part.done",
item_id: outputItemId,
output_index: 0,
content_index: 0,
part: { type: "output_text", text: "" },
});
// Complete the message item
const completedItem = createAssistantOutputItem({
id: outputItemId,
text: "",
status: "completed",
});
writeSseEvent(res, {
type: "response.output_item.done",
output_index: 0,
item: completedItem,
});
// Send function_call item
const functionCallItemId = `call_${randomUUID()}`;
const functionCallItem = {
type: "function_call" as const,
id: functionCallItemId,
call_id: functionCall.id,
name: functionCall.name,
arguments: functionCall.arguments,
};
writeSseEvent(res, {
type: "response.output_item.added",
output_index: 1,
item: functionCallItem,
});
writeSseEvent(res, {
type: "response.output_item.done",
output_index: 1,
item: { ...functionCallItem, status: "completed" as const },
});
const incompleteResponse = createResponseResource({
id: responseId,
model,
status: "incomplete",
output: [completedItem, functionCallItem],
});
writeSseEvent(res, { type: "response.completed", response: incompleteResponse });
writeDone(res);
res.end();
return;
}
const content =
Array.isArray(payloads) && payloads.length > 0
? payloads
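Taken together, the streaming tool-call branch above produces this event order on the wire (derived from the hunk; writeDone presumably emits the final "[DONE]" sentinel):

const expectedEventOrder = [
  "response.output_text.done",
  "response.content_part.done",
  "response.output_item.done",  // empty assistant message item
  "response.output_item.added", // function_call item at output_index 1
  "response.output_item.done",  // function_call item, status completed
  "response.completed",         // response.status === "incomplete"
];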