From c012019a8a345a67e336a5f256d680ca9a8e2eac Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 13 Jan 2026 01:46:21 +0000 Subject: [PATCH] fix: enforce reasoning tags on fallback providers (#810) (thanks @mcinteerj) --- CHANGELOG.md | 1 + .../reply/agent-runner.reasoning-tags.test.ts | 178 ++++++++++++++++++ src/auto-reply/reply/agent-runner.ts | 2 +- 3 files changed, 180 insertions(+), 1 deletion(-) create mode 100644 src/auto-reply/reply/agent-runner.reasoning-tags.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 034d0cab1..6084a3cf6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ - Tools: allow Claude/Gemini tool param aliases (`file_path`, `old_string`, `new_string`) while enforcing required params at runtime. (#793 — thanks @hsrvc) - Gemini: downgrade tool-call history missing `thought_signature` to avoid INVALID_ARGUMENT errors. (#793 — thanks @hsrvc) - Messaging: enforce context isolation for message tool sends across providers (normalized targets + tests). (#793 — thanks @hsrvc) +- Auto-reply: re-evaluate reasoning tag enforcement on fallback providers to prevent leaked reasoning. (#810 — thanks @mcinteerj) ## 2026.1.12-3 diff --git a/src/auto-reply/reply/agent-runner.reasoning-tags.test.ts b/src/auto-reply/reply/agent-runner.reasoning-tags.test.ts new file mode 100644 index 000000000..1bd9085b4 --- /dev/null +++ b/src/auto-reply/reply/agent-runner.reasoning-tags.test.ts @@ -0,0 +1,178 @@ +import { beforeEach, describe, expect, it, vi } from "vitest"; + +import type { SessionEntry } from "../../config/sessions.js"; +import type { TemplateContext } from "../templating.js"; +import { DEFAULT_MEMORY_FLUSH_PROMPT } from "./memory-flush.js"; +import type { FollowupRun, QueueSettings } from "./queue.js"; +import { createMockTypingController } from "./test-helpers.js"; + +const runEmbeddedPiAgentMock = vi.fn(); +const runWithModelFallbackMock = vi.fn(); + +vi.mock("../../agents/model-fallback.js", () => ({ + runWithModelFallback: (params: { + provider: string; + model: string; + run: (provider: string, model: string) => Promise; + }) => runWithModelFallbackMock(params), +})); + +vi.mock("../../agents/pi-embedded.js", () => ({ + queueEmbeddedPiMessage: vi.fn().mockReturnValue(false), + runEmbeddedPiAgent: (params: unknown) => runEmbeddedPiAgentMock(params), +})); + +vi.mock("./queue.js", async () => { + const actual = + await vi.importActual("./queue.js"); + return { + ...actual, + enqueueFollowupRun: vi.fn(), + scheduleFollowupDrain: vi.fn(), + }; +}); + +import { runReplyAgent } from "./agent-runner.js"; + +type EmbeddedPiAgentParams = { + enforceFinalTag?: boolean; + prompt?: string; +}; + +function createRun(params?: { + sessionEntry?: SessionEntry; + sessionKey?: string; + agentCfgContextTokens?: number; +}) { + const typing = createMockTypingController(); + const sessionCtx = { + Provider: "whatsapp", + OriginatingTo: "+15550001111", + AccountId: "primary", + MessageSid: "msg", + } as unknown as TemplateContext; + const resolvedQueue = { mode: "interrupt" } as unknown as QueueSettings; + const sessionKey = params?.sessionKey ?? "main"; + const followupRun = { + prompt: "hello", + summaryLine: "hello", + enqueuedAt: Date.now(), + run: { + agentId: "main", + agentDir: "/tmp/agent", + sessionId: "session", + sessionKey, + messageProvider: "whatsapp", + sessionFile: "/tmp/session.jsonl", + workspaceDir: "/tmp", + config: {}, + skillsSnapshot: {}, + provider: "anthropic", + model: "claude", + thinkLevel: "low", + verboseLevel: "off", + elevatedLevel: "off", + bashElevated: { + enabled: false, + allowed: false, + defaultLevel: "off", + }, + timeoutMs: 1_000, + blockReplyBreak: "message_end", + }, + } as unknown as FollowupRun; + + return runReplyAgent({ + commandBody: "hello", + followupRun, + queueKey: "main", + resolvedQueue, + shouldSteer: false, + shouldFollowup: false, + isActive: false, + isStreaming: false, + typing, + sessionCtx, + sessionEntry: params?.sessionEntry, + sessionKey, + defaultModel: "anthropic/claude-opus-4-5", + agentCfgContextTokens: params?.agentCfgContextTokens, + resolvedVerboseLevel: "off", + isNewSession: false, + blockStreamingEnabled: false, + resolvedBlockStreamingBreak: "message_end", + shouldInjectGroupIntro: false, + typingMode: "instant", + }); +} + +describe("runReplyAgent fallback reasoning tags", () => { + beforeEach(() => { + runEmbeddedPiAgentMock.mockReset(); + runWithModelFallbackMock.mockReset(); + }); + + it("enforces when the fallback provider requires reasoning tags", async () => { + runEmbeddedPiAgentMock.mockResolvedValueOnce({ + payloads: [{ text: "ok" }], + meta: {}, + }); + runWithModelFallbackMock.mockImplementationOnce( + async ({ + run, + }: { + run: (provider: string, model: string) => Promise; + }) => ({ + result: await run("google-antigravity", "gemini-3"), + provider: "google-antigravity", + model: "gemini-3", + }), + ); + + await createRun(); + + const call = runEmbeddedPiAgentMock.mock.calls[0]?.[0] as + | EmbeddedPiAgentParams + | undefined; + expect(call?.enforceFinalTag).toBe(true); + }); + + it("enforces during memory flush on fallback providers", async () => { + runEmbeddedPiAgentMock.mockImplementation( + async (params: EmbeddedPiAgentParams) => { + if (params.prompt === DEFAULT_MEMORY_FLUSH_PROMPT) { + return { payloads: [], meta: {} }; + } + return { payloads: [{ text: "ok" }], meta: {} }; + }, + ); + runWithModelFallbackMock.mockImplementation( + async ({ + run, + }: { + run: (provider: string, model: string) => Promise; + }) => ({ + result: await run("google-antigravity", "gemini-3"), + provider: "google-antigravity", + model: "gemini-3", + }), + ); + + await createRun({ + sessionEntry: { + sessionId: "session", + updatedAt: Date.now(), + totalTokens: 1_000_000, + compactionCount: 0, + }, + }); + + const flushCall = runEmbeddedPiAgentMock.mock.calls.find( + ([params]) => + (params as EmbeddedPiAgentParams | undefined)?.prompt === + DEFAULT_MEMORY_FLUSH_PROMPT, + )?.[0] as EmbeddedPiAgentParams | undefined; + + expect(flushCall?.enforceFinalTag).toBe(true); + }); +}); diff --git a/src/auto-reply/reply/agent-runner.ts b/src/auto-reply/reply/agent-runner.ts index b986ec71b..98c735998 100644 --- a/src/auto-reply/reply/agent-runner.ts +++ b/src/auto-reply/reply/agent-runner.ts @@ -40,13 +40,13 @@ import { getProviderDock } from "../../providers/dock.js"; import type { ProviderThreadingToolContext } from "../../providers/plugins/types.js"; import { normalizeProviderId } from "../../providers/registry.js"; import { defaultRuntime } from "../../runtime.js"; +import { isReasoningTagProvider } from "../../utils/provider-utils.js"; import { estimateUsageCost, formatTokenCount, formatUsd, resolveModelCostConfig, } from "../../utils/usage-format.js"; -import { isReasoningTagProvider } from "../../utils/provider-utils.js"; import { stripHeartbeatToken } from "../heartbeat.js"; import type { OriginatingChannelType, TemplateContext } from "../templating.js"; import { normalizeVerboseLevel, type VerboseLevel } from "../thinking.js";