fix: enforce final tag gating (#754) (thanks @mcinteerj)

2026-01-12 08:45:23 +00:00
parent a7cb270999
commit 252841ab13
5 changed files with 13 additions and 52 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -93,6 +93,7 @@
 - CLI/Update: preserve base environment when passing overrides to update subprocesses. (#713) — thanks @danielz1z.
 - Agents: treat message tool errors as failures so fallback replies still send; require `to` + `message` for `action=send`. (#717) — thanks @theglove44.
 - Agents: preserve reasoning items on tool-only turns.
 - Agents: enforce `<final>` gating for reasoning-tag providers to prevent tag/reasoning leaks. (#754) — thanks @mcinteerj.
 - Agents/Subagents: wait for completion before announcing, align wait timeout with run timeout, and make announce prompts more emphatic.
 - Agents: route subagent transcripts to the target agent sessions directory and add regression coverage. (#708) — thanks @xMikeMickelson.
 - Agents/Tools: preserve action enums when flattening tool schemas. (#708) — thanks @xMikeMickelson.
--- a/src/agents/pi-embedded-subscribe.test.ts
+++ b/src/agents/pi-embedded-subscribe.test.ts
@@ -16,7 +16,7 @@ describe("subscribeEmbeddedPiSession", () => {
    { tag: "thought", open: "<thought>", close: "</thought>" },
    { tag: "antthinking", open: "<antthinking>", close: "</antthinking>" },
  ] as const;
-  it("filters to <final> and falls back when tags are malformed", () => {
+  it("filters to <final> and suppresses output without a start tag", () => {
    let handler: ((evt: unknown) => void) | undefined;
    const session: StubSession = {
      subscribe: (fn) => {
@@ -38,6 +38,7 @@ describe("subscribeEmbeddedPiSession", () => {
      onAgentEvent,
    });
    handler?.({ type: "message_start", message: { role: "assistant" } });
    handler?.({
      type: "message_update",
      message: { role: "assistant" },
@@ -53,11 +54,7 @@ describe("subscribeEmbeddedPiSession", () => {
    onPartialReply.mockReset();
-    handler?.({
+    handler?.({ type: "message_start", message: { role: "assistant" } });
      type: "message_end",
      message: { role: "assistant" },
    });
    handler?.({
      type: "message_update",
      message: { role: "assistant" },
@@ -67,8 +64,7 @@ describe("subscribeEmbeddedPiSession", () => {
      },
    });
-    const secondPayload = onPartialReply.mock.calls[0][0];
+    expect(onPartialReply).not.toHaveBeenCalled();
    expect(secondPayload.text).toContain("Oops no start");
  });
  it("does not require <final> when enforcement is off", () => {
--- a/src/agents/pi-embedded-subscribe.ts
+++ b/src/agents/pi-embedded-subscribe.ts
@@ -36,9 +36,6 @@ import {
  promoteThinkingTagsToBlocks,
 } from "./pi-embedded-utils.js";
 const THINKING_TAG_RE = /<\s*\/?\s*(?:think(?:ing)?|thought|antthinking)\s*>/gi;
 const THINKING_OPEN_RE = /<\s*(?:think(?:ing)?|thought|antthinking)\s*>/i;
 const THINKING_CLOSE_RE = /<\s*\/\s*(?:think(?:ing)?|thought|antthinking)\s*>/i;
 const THINKING_TAG_SCAN_RE =
  /<\s*(\/?)\s*(?:think(?:ing)?|thought|antthinking)\s*>/gi;
 const FINAL_TAG_SCAN_RE = /<\s*(\/?)\s*final\s*>/gi;
@@ -182,11 +179,6 @@ export function subscribeEmbeddedPiSession(params: {
  }) => void;
  enforceFinalTag?: boolean;
 }) {
  if (params.enforceFinalTag) {
    log.debug("subscribeEmbeddedPiSession: enforceFinalTag is ENABLED");
  } else {
    log.debug("subscribeEmbeddedPiSession: enforceFinalTag is DISABLED");
  }
  const assistantTexts: string[] = [];
  const toolMetas: Array<{ toolName?: string; meta?: string }> = [];
  const toolMetaById = new Map<string, string | undefined>();
@@ -202,7 +194,6 @@ export function subscribeEmbeddedPiSession(params: {
  let blockBuffer = "";
  // Track if a streamed chunk opened a <think> block (stateful across chunks).
  const blockState = { thinking: false, final: false };
  const streamState = { thinking: false, final: false };
  let lastStreamedAssistant: string | undefined;
  let lastStreamedReasoning: string | undefined;
  let lastBlockReplyText: string | undefined;
@@ -220,8 +211,6 @@ export function subscribeEmbeddedPiSession(params: {
    blockChunker?.reset();
    blockState.thinking = false;
    blockState.final = false;
    streamState.thinking = false;
    streamState.final = false;
    lastStreamedAssistant = undefined;
    lastBlockReplyText = undefined;
    lastStreamedReasoning = undefined;
@@ -373,6 +362,7 @@ export function subscribeEmbeddedPiSession(params: {
    // hallucinations (e.g. Minimax copying the style) from leaking, but we
    // do not enforce buffering/extraction logic.
    if (!params.enforceFinalTag) {
      FINAL_TAG_SCAN_RE.lastIndex = 0;
      return processed.replace(FINAL_TAG_SCAN_RE, "");
    }
@@ -405,19 +395,6 @@ export function subscribeEmbeddedPiSession(params: {
    }
    state.final = inFinal;
    // Log the result of the stripping for debugging purposes
    if (params.enforceFinalTag && (everInFinal || processed.length > 0)) {
        log.debug(JSON.stringify({
            raw: processed.slice(0, 100),
            stripped: result.slice(0, 100),
            inFinal,
            everInFinal,
            rawLen: processed.length,
            strippedLen: result.length,
            tag: "DEBUG_STRIP"
        }));
    }
    // Strict Mode: If enforcing final tags, we MUST NOT return content unless
    // we have seen a <final> tag. Otherwise, we leak "thinking out loud" text
    // (e.g. "**Locating Manulife**...") that the model emitted without <think> tags.
@@ -936,8 +913,6 @@ export function subscribeEmbeddedPiSession(params: {
          blockChunker?.reset();
          blockState.thinking = false;
          blockState.final = false;
          streamState.thinking = false;
          streamState.final = false;
          lastStreamedAssistant = undefined;
        }
      }
@@ -1021,8 +996,6 @@ export function subscribeEmbeddedPiSession(params: {
        }
        blockState.thinking = false;
        blockState.final = false;
        streamState.thinking = false;
        streamState.final = false;
        if (pendingCompactionRetry > 0) {
          resolveCompactionRetry();
        } else {
--- a/src/auto-reply/reply.ts
+++ b/src/auto-reply/reply.ts
@@ -1156,9 +1156,6 @@ export async function getReplyFromConfig(
    resolvedQueue.mode === "collect" ||
    resolvedQueue.mode === "steer-backlog";
  const authProfileId = sessionEntry?.authProfileOverride;
  // DEBUG: Check provider for reasoning tag
  const shouldEnforce = isReasoningTagProvider(provider);
  // defaultRuntime.log(`[DEBUG] reply.ts: provider='${provider}', isReasoningTagProvider=${shouldEnforce}`);
  const followupRun = {
    prompt: queuedBody,
@@ -1198,15 +1195,7 @@ export async function getReplyFromConfig(
      ownerNumbers:
        command.ownerList.length > 0 ? command.ownerList : undefined,
      extraSystemPrompt: extraSystemPrompt || undefined,
-      ...(isReasoningTagProvider(provider)
+      ...(isReasoningTagProvider(provider) ? { enforceFinalTag: true } : {}),
        ? (() => {
            logVerbose(`[reply] Enforcing final tag for provider: ${provider}`);
            return { enforceFinalTag: true };
          })()
        : (() => {
            logVerbose(`[reply] NOT enforcing final tag for provider: ${provider}`);
            return {};
          })()),
    },
  };
--- a/src/utils/provider-utils.ts
+++ b/src/utils/provider-utils.ts
@@ -7,10 +7,12 @@
 * (e.g. <think> and <final>) in the text stream, rather than using native
 * API fields for reasoning/thinking.
 */
-export function isReasoningTagProvider(provider: string | undefined | null): boolean {
+export function isReasoningTagProvider(
  provider: string | undefined | null,
 ): boolean {
  if (!provider) return false;
  const normalized = provider.trim().toLowerCase();
-  
+
  // Check for exact matches or known prefixes/substrings for reasoning providers
  if (
    normalized === "ollama" ||
@@ -19,7 +21,7 @@ export function isReasoningTagProvider(provider: string | undefined | null): boo
  ) {
    return true;
  }
-  
+
  // Handle google-antigravity and its model variations (e.g. google-antigravity/gemini-3)
  if (normalized.includes("google-antigravity")) {
    return true;
@@ -29,6 +31,6 @@ export function isReasoningTagProvider(provider: string | undefined | null): boo
  if (normalized.includes("minimax")) {
    return true;
  }
-  
+
  return false;
 }