diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ba0531f6..ecd69a823 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -93,6 +93,7 @@ - CLI/Update: preserve base environment when passing overrides to update subprocesses. (#713) — thanks @danielz1z. - Agents: treat message tool errors as failures so fallback replies still send; require `to` + `message` for `action=send`. (#717) — thanks @theglove44. - Agents: preserve reasoning items on tool-only turns. +- Agents: enforce `` gating for reasoning-tag providers to prevent tag/reasoning leaks. (#754) — thanks @mcinteerj. - Agents/Subagents: wait for completion before announcing, align wait timeout with run timeout, and make announce prompts more emphatic. - Agents: route subagent transcripts to the target agent sessions directory and add regression coverage. (#708) — thanks @xMikeMickelson. - Agents/Tools: preserve action enums when flattening tool schemas. (#708) — thanks @xMikeMickelson. diff --git a/src/agents/pi-embedded-subscribe.test.ts b/src/agents/pi-embedded-subscribe.test.ts index 2b8afa6fb..066e8d124 100644 --- a/src/agents/pi-embedded-subscribe.test.ts +++ b/src/agents/pi-embedded-subscribe.test.ts @@ -16,7 +16,7 @@ describe("subscribeEmbeddedPiSession", () => { { tag: "thought", open: "", close: "" }, { tag: "antthinking", open: "", close: "" }, ] as const; - it("filters to and falls back when tags are malformed", () => { + it("filters to and suppresses output without a start tag", () => { let handler: ((evt: unknown) => void) | undefined; const session: StubSession = { subscribe: (fn) => { @@ -38,6 +38,7 @@ describe("subscribeEmbeddedPiSession", () => { onAgentEvent, }); + handler?.({ type: "message_start", message: { role: "assistant" } }); handler?.({ type: "message_update", message: { role: "assistant" }, @@ -53,11 +54,7 @@ describe("subscribeEmbeddedPiSession", () => { onPartialReply.mockReset(); - handler?.({ - type: "message_end", - message: { role: "assistant" }, - }); - + handler?.({ type: "message_start", message: { role: "assistant" } }); handler?.({ type: "message_update", message: { role: "assistant" }, @@ -67,8 +64,7 @@ describe("subscribeEmbeddedPiSession", () => { }, }); - const secondPayload = onPartialReply.mock.calls[0][0]; - expect(secondPayload.text).toContain("Oops no start"); + expect(onPartialReply).not.toHaveBeenCalled(); }); it("does not require when enforcement is off", () => { diff --git a/src/agents/pi-embedded-subscribe.ts b/src/agents/pi-embedded-subscribe.ts index 209936806..a9b6a9284 100644 --- a/src/agents/pi-embedded-subscribe.ts +++ b/src/agents/pi-embedded-subscribe.ts @@ -36,9 +36,6 @@ import { promoteThinkingTagsToBlocks, } from "./pi-embedded-utils.js"; -const THINKING_TAG_RE = /<\s*\/?\s*(?:think(?:ing)?|thought|antthinking)\s*>/gi; -const THINKING_OPEN_RE = /<\s*(?:think(?:ing)?|thought|antthinking)\s*>/i; -const THINKING_CLOSE_RE = /<\s*\/\s*(?:think(?:ing)?|thought|antthinking)\s*>/i; const THINKING_TAG_SCAN_RE = /<\s*(\/?)\s*(?:think(?:ing)?|thought|antthinking)\s*>/gi; const FINAL_TAG_SCAN_RE = /<\s*(\/?)\s*final\s*>/gi; @@ -182,11 +179,6 @@ export function subscribeEmbeddedPiSession(params: { }) => void; enforceFinalTag?: boolean; }) { - if (params.enforceFinalTag) { - log.debug("subscribeEmbeddedPiSession: enforceFinalTag is ENABLED"); - } else { - log.debug("subscribeEmbeddedPiSession: enforceFinalTag is DISABLED"); - } const assistantTexts: string[] = []; const toolMetas: Array<{ toolName?: string; meta?: string }> = []; const toolMetaById = new Map(); @@ -202,7 +194,6 @@ export function subscribeEmbeddedPiSession(params: { let blockBuffer = ""; // Track if a streamed chunk opened a block (stateful across chunks). const blockState = { thinking: false, final: false }; - const streamState = { thinking: false, final: false }; let lastStreamedAssistant: string | undefined; let lastStreamedReasoning: string | undefined; let lastBlockReplyText: string | undefined; @@ -220,8 +211,6 @@ export function subscribeEmbeddedPiSession(params: { blockChunker?.reset(); blockState.thinking = false; blockState.final = false; - streamState.thinking = false; - streamState.final = false; lastStreamedAssistant = undefined; lastBlockReplyText = undefined; lastStreamedReasoning = undefined; @@ -373,6 +362,7 @@ export function subscribeEmbeddedPiSession(params: { // hallucinations (e.g. Minimax copying the style) from leaking, but we // do not enforce buffering/extraction logic. if (!params.enforceFinalTag) { + FINAL_TAG_SCAN_RE.lastIndex = 0; return processed.replace(FINAL_TAG_SCAN_RE, ""); } @@ -405,19 +395,6 @@ export function subscribeEmbeddedPiSession(params: { } state.final = inFinal; - // Log the result of the stripping for debugging purposes - if (params.enforceFinalTag && (everInFinal || processed.length > 0)) { - log.debug(JSON.stringify({ - raw: processed.slice(0, 100), - stripped: result.slice(0, 100), - inFinal, - everInFinal, - rawLen: processed.length, - strippedLen: result.length, - tag: "DEBUG_STRIP" - })); - } - // Strict Mode: If enforcing final tags, we MUST NOT return content unless // we have seen a tag. Otherwise, we leak "thinking out loud" text // (e.g. "**Locating Manulife**...") that the model emitted without tags. @@ -936,8 +913,6 @@ export function subscribeEmbeddedPiSession(params: { blockChunker?.reset(); blockState.thinking = false; blockState.final = false; - streamState.thinking = false; - streamState.final = false; lastStreamedAssistant = undefined; } } @@ -1021,8 +996,6 @@ export function subscribeEmbeddedPiSession(params: { } blockState.thinking = false; blockState.final = false; - streamState.thinking = false; - streamState.final = false; if (pendingCompactionRetry > 0) { resolveCompactionRetry(); } else { diff --git a/src/auto-reply/reply.ts b/src/auto-reply/reply.ts index 074e8fe4c..0c81b8d88 100644 --- a/src/auto-reply/reply.ts +++ b/src/auto-reply/reply.ts @@ -1156,9 +1156,6 @@ export async function getReplyFromConfig( resolvedQueue.mode === "collect" || resolvedQueue.mode === "steer-backlog"; const authProfileId = sessionEntry?.authProfileOverride; - // DEBUG: Check provider for reasoning tag - const shouldEnforce = isReasoningTagProvider(provider); - // defaultRuntime.log(`[DEBUG] reply.ts: provider='${provider}', isReasoningTagProvider=${shouldEnforce}`); const followupRun = { prompt: queuedBody, @@ -1198,15 +1195,7 @@ export async function getReplyFromConfig( ownerNumbers: command.ownerList.length > 0 ? command.ownerList : undefined, extraSystemPrompt: extraSystemPrompt || undefined, - ...(isReasoningTagProvider(provider) - ? (() => { - logVerbose(`[reply] Enforcing final tag for provider: ${provider}`); - return { enforceFinalTag: true }; - })() - : (() => { - logVerbose(`[reply] NOT enforcing final tag for provider: ${provider}`); - return {}; - })()), + ...(isReasoningTagProvider(provider) ? { enforceFinalTag: true } : {}), }, }; diff --git a/src/utils/provider-utils.ts b/src/utils/provider-utils.ts index 841b60938..900b0e681 100644 --- a/src/utils/provider-utils.ts +++ b/src/utils/provider-utils.ts @@ -7,10 +7,12 @@ * (e.g. and ) in the text stream, rather than using native * API fields for reasoning/thinking. */ -export function isReasoningTagProvider(provider: string | undefined | null): boolean { +export function isReasoningTagProvider( + provider: string | undefined | null, +): boolean { if (!provider) return false; const normalized = provider.trim().toLowerCase(); - + // Check for exact matches or known prefixes/substrings for reasoning providers if ( normalized === "ollama" || @@ -19,7 +21,7 @@ export function isReasoningTagProvider(provider: string | undefined | null): boo ) { return true; } - + // Handle google-antigravity and its model variations (e.g. google-antigravity/gemini-3) if (normalized.includes("google-antigravity")) { return true; @@ -29,6 +31,6 @@ export function isReasoningTagProvider(provider: string | undefined | null): boo if (normalized.includes("minimax")) { return true; } - + return false; }