fix(agents): skip thinking tags in code spans

2026-01-15 09:23:10 +00:00
parent aac5b4673f
commit 7e1e7ba2d8
7 changed files with 275 additions and 6 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -61,6 +61,22 @@
 #### Gateway / Daemon / Sessions
 - Gateway: forward termination signals to respawned CLI child processes to avoid orphaned systemd runs. (#933) — thanks @roshanasingh4.
 - Gateway/UI: ship session defaults in the hello snapshot so the Control UI canonicalizes main session keys (no bare `main` alias).
+- Agents: skip thinking/final tag stripping inside Markdown code spans. (#939) — thanks @ngutman.
+- Browser: add tests for snapshot labels/efficient query params and labeled image responses.
+- macOS: ensure launchd log directory exists with a test-only override. (#909) — thanks @roshanasingh4.
+- macOS: format ConnectionsStore config to satisfy SwiftFormat lint. (#852) — thanks @mneves75.
+- Packaging: run `pnpm build` on `prepack` so npm publishes include fresh `dist/` output.
+- Telegram: register dock native commands with underscores to avoid `BOT_COMMAND_INVALID` (#929, fixes #901) — thanks @grp06.
+- Google: downgrade unsigned thinking blocks before send to avoid missing signature errors.
+- Agents: make user time zone and 24-hour time explicit in the system prompt. (#859) — thanks @CashWilliams.
+- Agents: strip downgraded tool call text without eating adjacent replies and filter thinking-tag leaks. (#905) — thanks @erikpr1994.
+- Agents: cap tool call IDs for OpenAI/OpenRouter to avoid request rejections. (#875) — thanks @j1philli.
+- Doctor: avoid re-adding WhatsApp config when only legacy ack reactions are set. (#927, fixes #900) — thanks @grp06.
+- Agents: scrub tuple `items` schemas for Gemini tool calls. (#926, fixes #746) — thanks @grp06.
+- Agents: stabilize sub-agent announce status from runtime outcomes and normalize Result/Notes. (#835) — thanks @roshanasingh4.
+- Apps: use canonical main session keys from gateway defaults across macOS/iOS/Android to avoid creating bare `main` sessions.
+- Embedded runner: suppress raw API error payloads from replies. (#924) — thanks @grp06.
+- Auth: normalize Claude Code CLI profile mode to oauth and auto-migrate config. (#855) — thanks @sebslight.
 - Daemon: clear persisted launchd disabled state before bootstrap (fixes `daemon install` after uninstall). (#849) — thanks @ndraiman.
 - Sessions: return deep clones (`structuredClone`) so cached session entries can't be mutated. (#934) — thanks @ronak-guliani.
 - Heartbeat: keep `updatedAt` monotonic when restoring heartbeat sessions. (#934) — thanks @ronak-guliani.
--- a/src/agents/pi-embedded-subscribe.code-span-awareness.test.ts
+++ b/src/agents/pi-embedded-subscribe.code-span-awareness.test.ts
@@ -0,0 +1,103 @@
+import { describe, expect, it, vi } from "vitest";
+import { subscribeEmbeddedPiSession } from "./pi-embedded-subscribe.js";
+
+type StubSession = {
+  subscribe: (fn: (evt: unknown) => void) => () => void;
+};
+
+describe("subscribeEmbeddedPiSession thinking tag code span awareness", () => {
+  it("does not strip thinking tags inside inline code backticks", () => {
+    let handler: ((evt: unknown) => void) | undefined;
+    const session: StubSession = {
+      subscribe: (fn) => {
+        handler = fn; // keep the subscriber so the test can push events itself
+        return () => {};
+      },
+    };
+
+    const onPartialReply = vi.fn();
+
+    subscribeEmbeddedPiSession({
+      session: session as unknown as Parameters<typeof subscribeEmbeddedPiSession>[0]["session"],
+      runId: "run",
+      onPartialReply,
+    });
+    // Push one streamed text delta whose tag is wrapped in inline-code backticks.
+    handler?.({
+      type: "message_update",
+      message: { role: "assistant" },
+      assistantMessageEvent: {
+        type: "text_delta",
+        delta: "The fix strips leaked `<thinking>` tags from messages.",
+      },
+    });
+    // The most recent partial reply must still contain the backticked tag.
+    expect(onPartialReply).toHaveBeenCalled();
+    const lastCall = onPartialReply.mock.calls[onPartialReply.mock.calls.length - 1];
+    expect(lastCall[0].text).toContain("`<thinking>`");
+  });
+
+  it("does not strip thinking tags inside fenced code blocks", () => {
+    let handler: ((evt: unknown) => void) | undefined;
+    const session: StubSession = {
+      subscribe: (fn) => {
+        handler = fn; // keep the subscriber so the test can push events itself
+        return () => {};
+      },
+    };
+
+    const onPartialReply = vi.fn();
+
+    subscribeEmbeddedPiSession({
+      session: session as unknown as Parameters<typeof subscribeEmbeddedPiSession>[0]["session"],
+      runId: "run",
+      onPartialReply,
+    });
+    // Push one delta where the tag pair sits between two indented four-backtick fence lines.
+    handler?.({
+      type: "message_update",
+      message: { role: "assistant" },
+      assistantMessageEvent: {
+        type: "text_delta",
+        delta: "Example:\n  ````\n<thinking>code example</thinking>\n  ````\nDone.",
+      },
+    });
+    // The fenced example must reach the most recent partial reply intact.
+    expect(onPartialReply).toHaveBeenCalled();
+    const lastCall = onPartialReply.mock.calls[onPartialReply.mock.calls.length - 1];
+    expect(lastCall[0].text).toContain("<thinking>code example</thinking>");
+  });
+
+  it("still strips actual thinking tags outside code spans", () => {
+    let handler: ((evt: unknown) => void) | undefined;
+    const session: StubSession = {
+      subscribe: (fn) => {
+        handler = fn; // keep the subscriber so the test can push events itself
+        return () => {};
+      },
+    };
+
+    const onPartialReply = vi.fn();
+
+    subscribeEmbeddedPiSession({
+      session: session as unknown as Parameters<typeof subscribeEmbeddedPiSession>[0]["session"],
+      runId: "run",
+      onPartialReply,
+    });
+    // Push one delta with a bare tag pair in ordinary prose (no backticks, no fence).
+    handler?.({
+      type: "message_update",
+      message: { role: "assistant" },
+      assistantMessageEvent: {
+        type: "text_delta",
+        delta: "Hello <thinking>internal thought</thinking> world",
+      },
+    });
+    // The tagged content must be gone while the surrounding words remain.
+    expect(onPartialReply).toHaveBeenCalled();
+    const lastCall = onPartialReply.mock.calls[onPartialReply.mock.calls.length - 1];
+    expect(lastCall[0].text).not.toContain("internal thought");
+    expect(lastCall[0].text).toContain("Hello");
+    expect(lastCall[0].text).toContain("world");
+  });
+});
--- a/src/agents/pi-embedded-subscribe.handlers.lifecycle.ts
+++ b/src/agents/pi-embedded-subscribe.handlers.lifecycle.ts
@@ -1,6 +1,7 @@
 import type { AgentEvent } from "@mariozechner/pi-agent-core";

 import { emitAgentEvent } from "../infra/agent-events.js";
+import { createInlineCodeState } from "../markdown/code-spans.js";
 import type { EmbeddedPiSubscribeContext } from "./pi-embedded-subscribe.handlers.types.js";

 export function handleAgentStart(ctx: EmbeddedPiSubscribeContext) {
@@ -75,6 +76,7 @@ export function handleAgentEnd(ctx: EmbeddedPiSubscribeContext) {

  ctx.state.blockState.thinking = false;
  ctx.state.blockState.final = false;
+  ctx.state.blockState.inlineCode = createInlineCodeState();

  if (ctx.state.pendingCompactionRetry > 0) {
    ctx.resolveCompactionRetry();
--- a/src/agents/pi-embedded-subscribe.handlers.messages.ts
+++ b/src/agents/pi-embedded-subscribe.handlers.messages.ts
@@ -17,6 +17,7 @@ import {
  formatReasoningMessage,
  promoteThinkingTagsToBlocks,
 } from "./pi-embedded-utils.js";
+import { createInlineCodeState } from "../markdown/code-spans.js";

 export function handleMessageStart(
  ctx: EmbeddedPiSubscribeContext,
@@ -103,6 +104,7 @@ export function handleMessageUpdate(
    .stripBlockTags(ctx.state.deltaBuffer, {
      thinking: false,
      final: false,
+      inlineCode: createInlineCodeState(),
    })
    .trim();
  if (next && next !== ctx.state.lastStreamedAssistant) {
@@ -240,5 +242,6 @@ export function handleMessageEnd(
  ctx.blockChunker?.reset();
  ctx.state.blockState.thinking = false;
  ctx.state.blockState.final = false;
+  ctx.state.blockState.inlineCode = createInlineCodeState();
  ctx.state.lastStreamedAssistant = undefined;
 }
--- a/src/agents/pi-embedded-subscribe.handlers.types.ts
+++ b/src/agents/pi-embedded-subscribe.handlers.types.ts
@@ -1,6 +1,7 @@
 import type { AgentEvent, AgentMessage } from "@mariozechner/pi-agent-core";

 import type { ReasoningLevel } from "../auto-reply/thinking.js";
+import type { InlineCodeState } from "../markdown/code-spans.js";
 import type { EmbeddedBlockChunker } from "./pi-embedded-block-chunker.js";
 import type { MessagingToolSend } from "./pi-embedded-messaging.js";
 import type {
@@ -27,7 +28,7 @@ export type EmbeddedPiSubscribeState = {

  deltaBuffer: string;
  blockBuffer: string;
-  blockState: { thinking: boolean; final: boolean };
+  blockState: { thinking: boolean; final: boolean; inlineCode: InlineCodeState };
  lastStreamedAssistant?: string;
  lastStreamedReasoning?: string;
  lastBlockReplyText?: string;
@@ -56,7 +57,10 @@ export type EmbeddedPiSubscribeContext = {

  shouldEmitToolResult: () => boolean;
  emitToolSummary: (toolName?: string, meta?: string) => void;
-  stripBlockTags: (text: string, state: { thinking: boolean; final: boolean }) => string;
+  stripBlockTags: (
+    text: string,
+    state: { thinking: boolean; final: boolean; inlineCode?: InlineCodeState },
+  ) => string;
  emitBlockChunk: (text: string) => void;
  flushBlockReplyBuffer: () => void;
  emitReasoningStream: (text: string) => void;
--- a/src/agents/pi-embedded-subscribe.ts
+++ b/src/agents/pi-embedded-subscribe.ts
@@ -1,6 +1,8 @@
 import { parseReplyDirectives } from "../auto-reply/reply/reply-directives.js";
 import { formatToolAggregate } from "../auto-reply/tool-meta.js";
 import { createSubsystemLogger } from "../logging.js";
+import type { InlineCodeState } from "../markdown/code-spans.js";
+import { buildCodeSpanIndex, createInlineCodeState } from "../markdown/code-spans.js";
 import { EmbeddedBlockChunker } from "./pi-embedded-block-chunker.js";
 import {
  isMessagingToolDuplicateNormalized,
@@ -38,7 +40,7 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar
    deltaBuffer: "",
    blockBuffer: "",
    // Track if a streamed chunk opened a <think> block (stateful across chunks).
-    blockState: { thinking: false, final: false },
+    blockState: { thinking: false, final: false, inlineCode: createInlineCodeState() },
    lastStreamedAssistant: undefined,
    lastStreamedReasoning: undefined,
    lastBlockReplyText: undefined,
@@ -72,6 +74,7 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar
    blockChunker?.reset();
    state.blockState.thinking = false;
    state.blockState.final = false;
+    state.blockState.inlineCode = createInlineCodeState();
    state.lastStreamedAssistant = undefined;
    state.lastBlockReplyText = undefined;
    state.lastStreamedReasoning = undefined;
@@ -185,9 +188,15 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar
    }
  };

-  const stripBlockTags = (text: string, state: { thinking: boolean; final: boolean }): string => {
+  const stripBlockTags = (
+    text: string,
+    state: { thinking: boolean; final: boolean; inlineCode?: InlineCodeState },
+  ): string => {
    if (!text) return text;

+    const inlineStateStart = state.inlineCode ?? createInlineCodeState();
+    const codeSpans = buildCodeSpanIndex(text, inlineStateStart);
+
    // 1. Handle <think> blocks (stateful, strip content inside)
    let processed = "";
    THINKING_TAG_SCAN_RE.lastIndex = 0;
@@ -195,6 +204,7 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar
    let inThinking = state.thinking;
    for (const match of text.matchAll(THINKING_TAG_SCAN_RE)) {
      const idx = match.index ?? 0;
+      if (codeSpans.isInside(idx)) continue;
      if (!inThinking) {
        processed += text.slice(lastIndex, idx);
      }
@@ -211,9 +221,11 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar
    // If enforcement is disabled, we still strip the tags themselves to prevent
    // hallucinations (e.g. Minimax copying the style) from leaking, but we
    // do not enforce buffering/extraction logic.
+    const finalCodeSpans = buildCodeSpanIndex(processed, inlineStateStart);
    if (!params.enforceFinalTag) {
+      state.inlineCode = finalCodeSpans.inlineState;
      FINAL_TAG_SCAN_RE.lastIndex = 0;
-      return processed.replace(FINAL_TAG_SCAN_RE, "");
+      return stripTagsOutsideCodeSpans(processed, FINAL_TAG_SCAN_RE, finalCodeSpans.isInside);
    }

    // If enforcement is enabled, only return text that appeared inside a <final> block.
@@ -225,6 +237,7 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar

    for (const match of processed.matchAll(FINAL_TAG_SCAN_RE)) {
      const idx = match.index ?? 0;
+      if (finalCodeSpans.isInside(idx)) continue;
      const isClose = match[1] === "/";

      if (!inFinal && !isClose) {
@@ -254,7 +267,27 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar

    // Hardened Cleanup: Remove any remaining <final> tags that might have been
    // missed (e.g. nested tags or hallucinations) to prevent leakage.
-    return result.replace(FINAL_TAG_SCAN_RE, "");
+    const resultCodeSpans = buildCodeSpanIndex(result, inlineStateStart);
+    state.inlineCode = resultCodeSpans.inlineState;
+    return stripTagsOutsideCodeSpans(result, FINAL_TAG_SCAN_RE, resultCodeSpans.isInside);
+  };
+  /** Drops each `pattern` match from `text` unless `isInside(matchStart)` is true. */
+  const stripTagsOutsideCodeSpans = (
+    text: string,
+    pattern: RegExp,
+    isInside: (index: number) => boolean,
+  ) => {
+    let output = "";
+    let lastIndex = 0;
+    pattern.lastIndex = 0; // reset so matchAll starts at index 0
+    for (const match of text.matchAll(pattern)) {
+      const idx = match.index ?? 0;
+      if (isInside(idx)) continue; // protected match: copied through by a later slice
+      output += text.slice(lastIndex, idx);
+      lastIndex = idx + match[0].length; // resume copying after the removed match
+    }
+    output += text.slice(lastIndex); // everything after the last removed match
+    return output;
  };

  const emitBlockChunk = (text: string) => {
--- a/src/markdown/code-spans.ts
+++ b/src/markdown/code-spans.ts
@@ -0,0 +1,108 @@
+import { parseFenceSpans, type FenceSpan } from "./fences.js";
+
+export type InlineCodeState = {
+  open: boolean; // true while an opening backtick run has not yet been closed
+  ticks: number; // length of that opening run; reset to 0 when the span closes
+};
+/** Returns a fresh state with no inline code span open. */
+export function createInlineCodeState(): InlineCodeState {
+  return { open: false, ticks: 0 };
+}
+
+type InlineCodeSpansResult = {
+  spans: Array<[number, number]>; // [start, end) offsets into the scanned text
+  state: InlineCodeState; // open/ticks as they stand at the end of the text
+};
+
+export type CodeSpanIndex = {
+  inlineState: InlineCodeState; // inline state reached at the end of the indexed text
+  isInside: (index: number) => boolean; // true if `index` is in a fence or inline span
+};
+/** Indexes fenced and inline code spans of `text`; `inlineState` seeds the inline scan. */
+export function buildCodeSpanIndex(
+  text: string,
+  inlineState?: InlineCodeState,
+): CodeSpanIndex {
+  const fenceSpans = parseFenceSpans(text);
+  const startState = inlineState
+    ? { open: inlineState.open, ticks: inlineState.ticks } // copy of the caller's state
+    : createInlineCodeState();
+  const { spans: inlineSpans, state: nextInlineState } = parseInlineCodeSpans(
+    text,
+    fenceSpans,
+    startState,
+  );
+
+  return {
+    inlineState: nextInlineState,
+    isInside: (index: number) =>
+      isInsideFenceSpan(index, fenceSpans) || isInsideInlineSpan(index, inlineSpans),
+  };
+}
+/** Finds backtick spans in `text` outside `fenceSpans`, resuming from `initialState`. */
+function parseInlineCodeSpans(
+  text: string,
+  fenceSpans: FenceSpan[],
+  initialState: InlineCodeState,
+): InlineCodeSpansResult {
+  const spans: Array<[number, number]> = [];
+  let open = initialState.open;
+  let ticks = initialState.ticks;
+  let openStart = open ? 0 : -1; // a span carried in as open starts at offset 0 of this text
+
+  let i = 0;
+  while (i < text.length) {
+    const fence = findFenceSpanAtInclusive(fenceSpans, i);
+    if (fence) {
+      i = fence.end; // jump past the fence; its contents are not scanned for backticks
+      continue;
+    }
+
+    if (text[i] !== "`") {
+      i += 1;
+      continue;
+    }
+
+    const runStart = i;
+    let runLength = 0;
+    while (i < text.length && text[i] === "`") {
+      runLength += 1;
+      i += 1;
+    }
+    // `i` now points just past the backtick run; with no span open, the run opens one.
+    if (!open) {
+      open = true;
+      ticks = runLength;
+      openStart = runStart;
+      continue;
+    }
+    // Only a run of the same length closes the span; runs of other lengths are ignored.
+    if (runLength === ticks) {
+      spans.push([openStart, i]);
+      open = false;
+      ticks = 0;
+      openStart = -1;
+    }
+  }
+  // A span still open here is recorded as running to the end of the text.
+  if (open) {
+    spans.push([openStart, text.length]);
+  }
+
+  return {
+    spans,
+    state: { open, ticks },
+  };
+}
+/** Returns the first fence span with start <= index < end (end is exclusive), else undefined. */
+function findFenceSpanAtInclusive(spans: FenceSpan[], index: number): FenceSpan | undefined {
+  return spans.find((span) => index >= span.start && index < span.end);
+}
+/** Reports whether `index` falls in any fence span (start inclusive, end exclusive). */
+function isInsideFenceSpan(index: number, spans: FenceSpan[]): boolean {
+  return findFenceSpanAtInclusive(spans, index) !== undefined; // one shared range predicate
+}
+/** Reports whether `index` falls in any inline span (start inclusive, end exclusive). */
+function isInsideInlineSpan(index: number, spans: Array<[number, number]>): boolean {
+  return spans.some(([start, end]) => index >= start && index < end);
+}