diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ba0531f6..ecd69a823 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -93,6 +93,7 @@ - CLI/Update: preserve base environment when passing overrides to update subprocesses. (#713) — thanks @danielz1z. - Agents: treat message tool errors as failures so fallback replies still send; require `to` + `message` for `action=send`. (#717) — thanks @theglove44. - Agents: preserve reasoning items on tool-only turns. +- Agents: enforce `` gating for reasoning-tag providers to prevent tag/reasoning leaks. (#754) — thanks @mcinteerj. - Agents/Subagents: wait for completion before announcing, align wait timeout with run timeout, and make announce prompts more emphatic. - Agents: route subagent transcripts to the target agent sessions directory and add regression coverage. (#708) — thanks @xMikeMickelson. - Agents/Tools: preserve action enums when flattening tool schemas. (#708) — thanks @xMikeMickelson. diff --git a/src/agents/pi-embedded-helpers.ts b/src/agents/pi-embedded-helpers.ts index 7b24dc732..4d7539937 100644 --- a/src/agents/pi-embedded-helpers.ts +++ b/src/agents/pi-embedded-helpers.ts @@ -183,7 +183,11 @@ export async function sanitizeSessionMessagesImages( const GOOGLE_TURN_ORDER_BOOTSTRAP_TEXT = "(session bootstrap)"; export function isGoogleModelApi(api?: string | null): boolean { - return api === "google-gemini-cli" || api === "google-generative-ai"; + return ( + api === "google-gemini-cli" || + api === "google-generative-ai" || + api === "google-antigravity" + ); } export function sanitizeGoogleTurnOrdering( diff --git a/src/agents/pi-embedded-runner.ts b/src/agents/pi-embedded-runner.ts index 49ee41a4b..d985b2595 100644 --- a/src/agents/pi-embedded-runner.ts +++ b/src/agents/pi-embedded-runner.ts @@ -43,6 +43,7 @@ import { enqueueCommandInLane, } from "../process/command-queue.js"; import { normalizeMessageProvider } from "../utils/message-provider.js"; +import { isReasoningTagProvider } from "../utils/provider-utils.js"; import { resolveUserPath } from "../utils.js"; import { resolveClawdbotAgentDir } from "./agent-paths.js"; import { resolveSessionAgentIds } from "./agent-scope.js"; @@ -1141,7 +1142,7 @@ export async function compactEmbeddedPiSession(params: { sandbox, params.bashElevated, ); - const reasoningTagHint = provider === "ollama"; + const reasoningTagHint = isReasoningTagProvider(provider); const userTimezone = resolveUserTimezone( params.config?.agents?.defaults?.userTimezone, ); @@ -1547,7 +1548,7 @@ export async function runEmbeddedPiAgent(params: { sandbox, params.bashElevated, ); - const reasoningTagHint = provider === "ollama"; + const reasoningTagHint = isReasoningTagProvider(provider); const userTimezone = resolveUserTimezone( params.config?.agents?.defaults?.userTimezone, ); diff --git a/src/agents/pi-embedded-subscribe.test.ts b/src/agents/pi-embedded-subscribe.test.ts index 2b8afa6fb..066e8d124 100644 --- a/src/agents/pi-embedded-subscribe.test.ts +++ b/src/agents/pi-embedded-subscribe.test.ts @@ -16,7 +16,7 @@ describe("subscribeEmbeddedPiSession", () => { { tag: "thought", open: "", close: "" }, { tag: "antthinking", open: "", close: "" }, ] as const; - it("filters to and falls back when tags are malformed", () => { + it("filters to and suppresses output without a start tag", () => { let handler: ((evt: unknown) => void) | undefined; const session: StubSession = { subscribe: (fn) => { @@ -38,6 +38,7 @@ describe("subscribeEmbeddedPiSession", () => { onAgentEvent, }); + handler?.({ type: "message_start", message: { role: "assistant" } }); handler?.({ type: "message_update", message: { role: "assistant" }, @@ -53,11 +54,7 @@ describe("subscribeEmbeddedPiSession", () => { onPartialReply.mockReset(); - handler?.({ - type: "message_end", - message: { role: "assistant" }, - }); - + handler?.({ type: "message_start", message: { role: "assistant" } }); handler?.({ type: "message_update", message: { role: "assistant" }, @@ -67,8 +64,7 @@ describe("subscribeEmbeddedPiSession", () => { }, }); - const secondPayload = onPartialReply.mock.calls[0][0]; - expect(secondPayload.text).toContain("Oops no start"); + expect(onPartialReply).not.toHaveBeenCalled(); }); it("does not require when enforcement is off", () => { diff --git a/src/agents/pi-embedded-subscribe.ts b/src/agents/pi-embedded-subscribe.ts index 9b9b04f68..a9b6a9284 100644 --- a/src/agents/pi-embedded-subscribe.ts +++ b/src/agents/pi-embedded-subscribe.ts @@ -36,11 +36,9 @@ import { promoteThinkingTagsToBlocks, } from "./pi-embedded-utils.js"; -const THINKING_TAG_RE = /<\s*\/?\s*(?:think(?:ing)?|thought|antthinking)\s*>/gi; -const THINKING_OPEN_RE = /<\s*(?:think(?:ing)?|thought|antthinking)\s*>/i; -const THINKING_CLOSE_RE = /<\s*\/\s*(?:think(?:ing)?|thought|antthinking)\s*>/i; const THINKING_TAG_SCAN_RE = /<\s*(\/?)\s*(?:think(?:ing)?|thought|antthinking)\s*>/gi; +const FINAL_TAG_SCAN_RE = /<\s*(\/?)\s*final\s*>/gi; const TOOL_RESULT_MAX_CHARS = 8000; const log = createSubsystemLogger("agent/embedded"); const RAW_STREAM_ENABLED = process.env.CLAWDBOT_RAW_STREAM === "1"; @@ -111,37 +109,6 @@ function isToolResultError(result: unknown): boolean { return normalized === "error" || normalized === "timeout"; } -function stripThinkingSegments(text: string): string { - if (!text || !THINKING_TAG_RE.test(text)) return text; - THINKING_TAG_RE.lastIndex = 0; - let result = ""; - let lastIndex = 0; - let inThinking = false; - for (const match of text.matchAll(THINKING_TAG_RE)) { - const idx = match.index ?? 0; - if (!inThinking) { - result += text.slice(lastIndex, idx); - } - const tag = match[0].toLowerCase(); - inThinking = !tag.includes("/"); - lastIndex = idx + match[0].length; - } - if (!inThinking) { - result += text.slice(lastIndex); - } - return result; -} - -function stripUnpairedThinkingTags(text: string): string { - if (!text) return text; - const hasOpen = THINKING_OPEN_RE.test(text); - const hasClose = THINKING_CLOSE_RE.test(text); - if (hasOpen && hasClose) return text; - if (!hasOpen) return text.replace(THINKING_CLOSE_RE, ""); - if (!hasClose) return text.replace(THINKING_OPEN_RE, ""); - return text; -} - function extractMessagingToolSend( toolName: string, args: Record, @@ -226,7 +193,7 @@ export function subscribeEmbeddedPiSession(params: { let deltaBuffer = ""; let blockBuffer = ""; // Track if a streamed chunk opened a block (stateful across chunks). - let blockThinkingActive = false; + const blockState = { thinking: false, final: false }; let lastStreamedAssistant: string | undefined; let lastStreamedReasoning: string | undefined; let lastBlockReplyText: string | undefined; @@ -242,7 +209,8 @@ export function subscribeEmbeddedPiSession(params: { deltaBuffer = ""; blockBuffer = ""; blockChunker?.reset(); - blockThinkingActive = false; + blockState.thinking = false; + blockState.final = false; lastStreamedAssistant = undefined; lastBlockReplyText = undefined; lastStreamedReasoning = undefined; @@ -337,27 +305,6 @@ export function subscribeEmbeddedPiSession(params: { compactionRetryPromise = null; } }; - const FINAL_START_RE = /<\s*final\s*>/i; - const FINAL_END_RE = /<\s*\/\s*final\s*>/i; - // Local providers sometimes emit malformed tags; normalize before filtering. - const sanitizeFinalText = (text: string): string => { - if (!text) return text; - const hasStart = FINAL_START_RE.test(text); - const hasEnd = FINAL_END_RE.test(text); - if (hasStart && !hasEnd) return text.replace(FINAL_START_RE, ""); - if (!hasStart && hasEnd) return text.replace(FINAL_END_RE, ""); - return text; - }; - const extractFinalText = (text: string): string | undefined => { - const cleaned = sanitizeFinalText(text); - const startMatch = FINAL_START_RE.exec(cleaned); - if (!startMatch) return undefined; - const startIndex = startMatch.index + startMatch[0].length; - const afterStart = cleaned.slice(startIndex); - const endMatch = FINAL_END_RE.exec(afterStart); - const endIndex = endMatch ? endMatch.index : afterStart.length; - return afterStart.slice(0, endIndex); - }; const blockChunking = params.blockReplyChunking; const blockChunker = blockChunking @@ -385,34 +332,85 @@ export function subscribeEmbeddedPiSession(params: { } }; - const stripBlockThinkingSegments = (text: string): string => { + const stripBlockTags = ( + text: string, + state: { thinking: boolean; final: boolean }, + ): string => { if (!text) return text; - if (!blockThinkingActive && !THINKING_TAG_SCAN_RE.test(text)) return text; + + // 1. Handle blocks (stateful, strip content inside) + let processed = ""; THINKING_TAG_SCAN_RE.lastIndex = 0; - let result = ""; let lastIndex = 0; - let inThinking = blockThinkingActive; + let inThinking = state.thinking; for (const match of text.matchAll(THINKING_TAG_SCAN_RE)) { const idx = match.index ?? 0; if (!inThinking) { - result += text.slice(lastIndex, idx); + processed += text.slice(lastIndex, idx); } const isClose = match[1] === "/"; inThinking = !isClose; lastIndex = idx + match[0].length; } if (!inThinking) { - result += text.slice(lastIndex); + processed += text.slice(lastIndex); } - blockThinkingActive = inThinking; - return result; + state.thinking = inThinking; + + // 2. Handle blocks (stateful, strip content OUTSIDE) + // If enforcement is disabled, we still strip the tags themselves to prevent + // hallucinations (e.g. Minimax copying the style) from leaking, but we + // do not enforce buffering/extraction logic. + if (!params.enforceFinalTag) { + FINAL_TAG_SCAN_RE.lastIndex = 0; + return processed.replace(FINAL_TAG_SCAN_RE, ""); + } + + // If enforcement is enabled, only return text that appeared inside a block. + let result = ""; + FINAL_TAG_SCAN_RE.lastIndex = 0; + let lastFinalIndex = 0; + let inFinal = state.final; + let everInFinal = state.final; + + for (const match of processed.matchAll(FINAL_TAG_SCAN_RE)) { + const idx = match.index ?? 0; + const isClose = match[1] === "/"; + + if (!inFinal && !isClose) { + // Found start tag. + inFinal = true; + everInFinal = true; + lastFinalIndex = idx + match[0].length; + } else if (inFinal && isClose) { + // Found end tag. + result += processed.slice(lastFinalIndex, idx); + inFinal = false; + lastFinalIndex = idx + match[0].length; + } + } + + if (inFinal) { + result += processed.slice(lastFinalIndex); + } + state.final = inFinal; + + // Strict Mode: If enforcing final tags, we MUST NOT return content unless + // we have seen a tag. Otherwise, we leak "thinking out loud" text + // (e.g. "**Locating Manulife**...") that the model emitted without tags. + if (!everInFinal) { + return ""; + } + + // Hardened Cleanup: Remove any remaining tags that might have been + // missed (e.g. nested tags or hallucinations) to prevent leakage. + return result.replace(FINAL_TAG_SCAN_RE, ""); }; const emitBlockChunk = (text: string) => { if (suppressBlockChunks) return; - // Strip blocks across chunk boundaries to avoid leaking reasoning. - const strippedText = stripBlockThinkingSegments(text); - const chunk = strippedText.trimEnd(); + // Strip and blocks across chunk boundaries to avoid leaking reasoning. + const chunk = stripBlockTags(text, blockState).trimEnd(); if (!chunk) return; if (chunk === lastBlockReplyText) return; @@ -754,12 +752,10 @@ export function subscribeEmbeddedPiSession(params: { emitReasoningStream(extractThinkingFromTaggedStream(deltaBuffer)); } - const cleaned = params.enforceFinalTag - ? stripThinkingSegments(stripUnpairedThinkingTags(deltaBuffer)) - : stripThinkingSegments(deltaBuffer); - const next = params.enforceFinalTag - ? (extractFinalText(cleaned)?.trim() ?? cleaned.trim()) - : cleaned.trim(); + const next = stripBlockTags(deltaBuffer, { + thinking: false, + final: false, + }).trim(); if (next && next !== lastStreamedAssistant) { lastStreamedAssistant = next; const { text: cleanedText, mediaUrls } = @@ -822,13 +818,10 @@ export function subscribeEmbeddedPiSession(params: { rawText, rawThinking: extractAssistantThinking(assistantMessage), }); - const cleaned = params.enforceFinalTag - ? stripThinkingSegments(stripUnpairedThinkingTags(rawText)) - : stripThinkingSegments(rawText); - const baseText = - params.enforceFinalTag && cleaned - ? (extractFinalText(cleaned)?.trim() ?? cleaned) - : cleaned; + const text = stripBlockTags(rawText, { + thinking: false, + final: false, + }); const rawThinking = includeReasoning || streamReasoning ? extractAssistantThinking(assistantMessage) || @@ -837,7 +830,6 @@ export function subscribeEmbeddedPiSession(params: { const formattedReasoning = rawThinking ? formatReasoningMessage(rawThinking) : ""; - const text = baseText; const addedDuringMessage = assistantTexts.length > assistantTextBaseline; @@ -919,7 +911,8 @@ export function subscribeEmbeddedPiSession(params: { deltaBuffer = ""; blockBuffer = ""; blockChunker?.reset(); - blockThinkingActive = false; + blockState.thinking = false; + blockState.final = false; lastStreamedAssistant = undefined; } } @@ -1001,7 +994,8 @@ export function subscribeEmbeddedPiSession(params: { blockBuffer = ""; } } - blockThinkingActive = false; + blockState.thinking = false; + blockState.final = false; if (pendingCompactionRetry > 0) { resolveCompactionRetry(); } else { diff --git a/src/auto-reply/reply.ts b/src/auto-reply/reply.ts index 024b31280..0c81b8d88 100644 --- a/src/auto-reply/reply.ts +++ b/src/auto-reply/reply.ts @@ -40,6 +40,7 @@ import { import { normalizeMainKey } from "../routing/session-key.js"; import { defaultRuntime } from "../runtime.js"; import { INTERNAL_MESSAGE_PROVIDER } from "../utils/message-provider.js"; +import { isReasoningTagProvider } from "../utils/provider-utils.js"; import { resolveCommandAuthorization } from "./command-auth.js"; import { hasControlCommand } from "./command-detection.js"; import { @@ -1155,6 +1156,7 @@ export async function getReplyFromConfig( resolvedQueue.mode === "collect" || resolvedQueue.mode === "steer-backlog"; const authProfileId = sessionEntry?.authProfileOverride; + const followupRun = { prompt: queuedBody, messageId: sessionCtx.MessageSid, @@ -1193,7 +1195,7 @@ export async function getReplyFromConfig( ownerNumbers: command.ownerList.length > 0 ? command.ownerList : undefined, extraSystemPrompt: extraSystemPrompt || undefined, - ...(provider === "ollama" ? { enforceFinalTag: true } : {}), + ...(isReasoningTagProvider(provider) ? { enforceFinalTag: true } : {}), }, }; diff --git a/src/utils/provider-utils.ts b/src/utils/provider-utils.ts new file mode 100644 index 000000000..900b0e681 --- /dev/null +++ b/src/utils/provider-utils.ts @@ -0,0 +1,36 @@ +/** + * Utility functions for provider-specific logic and capabilities. + */ + +/** + * Returns true if the provider requires reasoning to be wrapped in tags + * (e.g. and ) in the text stream, rather than using native + * API fields for reasoning/thinking. + */ +export function isReasoningTagProvider( + provider: string | undefined | null, +): boolean { + if (!provider) return false; + const normalized = provider.trim().toLowerCase(); + + // Check for exact matches or known prefixes/substrings for reasoning providers + if ( + normalized === "ollama" || + normalized === "google-gemini-cli" || + normalized === "google-generative-ai" + ) { + return true; + } + + // Handle google-antigravity and its model variations (e.g. google-antigravity/gemini-3) + if (normalized.includes("google-antigravity")) { + return true; + } + + // Handle Minimax (M2.1 is chatty/reasoning-like) + if (normalized.includes("minimax")) { + return true; + } + + return false; +}