Merge pull request #810 from mcinteerj/fix/runtime-reasoning-enforcement

fix(auto-reply): enforce reasoning tags on fallback providers
This commit is contained in:
Peter Steinberger
2026-01-13 01:46:41 +00:00
committed by GitHub
3 changed files with 186 additions and 2 deletions

View File

@@ -10,6 +10,7 @@
- Tools: allow Claude/Gemini tool param aliases (`file_path`, `old_string`, `new_string`) while enforcing required params at runtime. (#793 — thanks @hsrvc)
- Gemini: downgrade tool-call history missing `thought_signature` to avoid INVALID_ARGUMENT errors. (#793 — thanks @hsrvc)
- Messaging: enforce context isolation for message tool sends across providers (normalized targets + tests). (#793 — thanks @hsrvc)
- Auto-reply: re-evaluate reasoning tag enforcement on fallback providers to prevent leaked reasoning. (#810 — thanks @mcinteerj)
## 2026.1.12-3

View File

@@ -0,0 +1,178 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
import type { SessionEntry } from "../../config/sessions.js";
import type { TemplateContext } from "../templating.js";
import { DEFAULT_MEMORY_FLUSH_PROMPT } from "./memory-flush.js";
import type { FollowupRun, QueueSettings } from "./queue.js";
import { createMockTypingController } from "./test-helpers.js";
// Module-level spies shared by every test in this file. Each test programs
// them for its scenario and then inspects the recorded calls.
const runEmbeddedPiAgentMock = vi.fn();
const runWithModelFallbackMock = vi.fn();
// Replace the model-fallback module: `runWithModelFallback` forwards its single
// params object to the spy above. The arrow wrapper means the spy is looked up
// when the export is called, not when this factory runs.
// NOTE(review): presumably needed because vitest hoists `vi.mock` above the
// surrounding declarations — confirm against the vitest docs.
vi.mock("../../agents/model-fallback.js", () => ({
runWithModelFallback: (params: {
provider: string;
model: string;
run: (provider: string, model: string) => Promise<unknown>;
}) => runWithModelFallbackMock(params),
}));
// Replace the embedded agent module: `queueEmbeddedPiMessage` is a stub that
// always returns false, and `runEmbeddedPiAgent` forwards to the shared spy so
// tests can read back the params it was called with.
vi.mock("../../agents/pi-embedded.js", () => ({
queueEmbeddedPiMessage: vi.fn().mockReturnValue(false),
runEmbeddedPiAgent: (params: unknown) => runEmbeddedPiAgentMock(params),
}));
// Partially mock the queue module: every real export is kept except
// `enqueueFollowupRun` and `scheduleFollowupDrain`, which become bare spies.
vi.mock("./queue.js", async () => {
const actual =
await vi.importActual<typeof import("./queue.js")>("./queue.js");
return {
...actual,
enqueueFollowupRun: vi.fn(),
scheduleFollowupDrain: vi.fn(),
};
});
import { runReplyAgent } from "./agent-runner.js";
/**
 * The slice of the `runEmbeddedPiAgent` argument that these tests read back
 * from the mock's recorded calls.
 */
interface EmbeddedPiAgentParams {
  /** Flag the assertions in this file expect to be `true`. */
  enforceFinalTag?: boolean;
  /** Prompt text; compared against the memory-flush prompt to pick out a call. */
  prompt?: string;
}
/**
 * Calls `runReplyAgent` once with a fixed "hello" fixture and returns its result.
 *
 * @param params - Optional overrides. `sessionEntry` and `agentCfgContextTokens`
 *   are forwarded as given; `sessionKey` falls back to `"main"`.
 * @returns Whatever `runReplyAgent` returns for the fixture.
 */
function createRun(params?: {
  sessionEntry?: SessionEntry;
  sessionKey?: string;
  agentCfgContextTokens?: number;
}) {
  const typing = createMockTypingController();
  const sessionKey = params?.sessionKey ?? "main";

  // Only the fields listed in these fixtures are populated; the double casts
  // stand in for the rest of each project type.
  const resolvedQueue = { mode: "interrupt" } as unknown as QueueSettings;
  const sessionCtx = {
    Provider: "whatsapp",
    OriginatingTo: "+15550001111",
    AccountId: "primary",
    MessageSid: "msg",
  } as unknown as TemplateContext;

  // Run configuration for the followup; the configured provider/model pair is
  // anthropic/claude.
  const run = {
    agentId: "main",
    agentDir: "/tmp/agent",
    sessionId: "session",
    sessionKey,
    messageProvider: "whatsapp",
    sessionFile: "/tmp/session.jsonl",
    workspaceDir: "/tmp",
    config: {},
    skillsSnapshot: {},
    provider: "anthropic",
    model: "claude",
    thinkLevel: "low",
    verboseLevel: "off",
    elevatedLevel: "off",
    bashElevated: {
      enabled: false,
      allowed: false,
      defaultLevel: "off",
    },
    timeoutMs: 1_000,
    blockReplyBreak: "message_end",
  };

  const followupRun = {
    prompt: "hello",
    summaryLine: "hello",
    enqueuedAt: Date.now(),
    run,
  } as unknown as FollowupRun;

  return runReplyAgent({
    commandBody: "hello",
    followupRun,
    queueKey: "main",
    resolvedQueue,
    shouldSteer: false,
    shouldFollowup: false,
    isActive: false,
    isStreaming: false,
    typing,
    sessionCtx,
    sessionEntry: params?.sessionEntry,
    sessionKey,
    defaultModel: "anthropic/claude-opus-4-5",
    agentCfgContextTokens: params?.agentCfgContextTokens,
    resolvedVerboseLevel: "off",
    isNewSession: false,
    blockStreamingEnabled: false,
    resolvedBlockStreamingBreak: "message_end",
    shouldInjectGroupIntro: false,
    typingMode: "instant",
  });
}
// Verifies that the params handed to the embedded agent carry
// `enforceFinalTag: true` when the model-fallback wrapper runs the request on
// the google-antigravity / gemini-3 pair instead of the configured model.
describe("runReplyAgent fallback reasoning tags", () => {
  /** Argument shape the mocked `runWithModelFallback` receives. */
  type FallbackParams = {
    run: (provider: string, model: string) => Promise<unknown>;
  };

  const fallbackProvider = "google-antigravity";
  const fallbackModel = "gemini-3";

  /** Stand-in for `runWithModelFallback` that always executes on the fallback pair. */
  const runOnFallback = async ({ run }: FallbackParams) => {
    const result = await run(fallbackProvider, fallbackModel);
    return { result, provider: fallbackProvider, model: fallbackModel };
  };

  beforeEach(() => {
    // Drop recorded calls and programmed behavior from the previous test.
    for (const mock of [runEmbeddedPiAgentMock, runWithModelFallbackMock]) {
      mock.mockReset();
    }
  });

  it("enforces <final> when the fallback provider requires reasoning tags", async () => {
    runEmbeddedPiAgentMock.mockResolvedValueOnce({
      payloads: [{ text: "ok" }],
      meta: {},
    });
    runWithModelFallbackMock.mockImplementationOnce(runOnFallback);

    await createRun();

    // Inspect the first argument of the first recorded agent invocation.
    const [firstCall] = runEmbeddedPiAgentMock.mock.calls;
    const agentParams = firstCall?.[0] as EmbeddedPiAgentParams | undefined;
    expect(agentParams?.enforceFinalTag).toBe(true);
  });

  it("enforces <final> during memory flush on fallback providers", async () => {
    // The flush prompt gets an empty reply; any other prompt gets "ok".
    runEmbeddedPiAgentMock.mockImplementation(
      async (params: EmbeddedPiAgentParams) => {
        const isFlush = params.prompt === DEFAULT_MEMORY_FLUSH_PROMPT;
        return { payloads: isFlush ? [] : [{ text: "ok" }], meta: {} };
      },
    );
    runWithModelFallbackMock.mockImplementation(runOnFallback);

    // NOTE(review): the very large totalTokens is presumably what makes the
    // runner decide a memory flush is due — confirm against the flush settings.
    await createRun({
      sessionEntry: {
        sessionId: "session",
        updatedAt: Date.now(),
        totalTokens: 1_000_000,
        compactionCount: 0,
      },
    });

    // Locate the first agent invocation that carried the memory-flush prompt.
    let flushCall: EmbeddedPiAgentParams | undefined;
    for (const [first] of runEmbeddedPiAgentMock.mock.calls) {
      const candidate = first as EmbeddedPiAgentParams | undefined;
      if (candidate?.prompt === DEFAULT_MEMORY_FLUSH_PROMPT) {
        flushCall = candidate;
        break;
      }
    }
    expect(flushCall?.enforceFinalTag).toBe(true);
  });
});

View File

@@ -40,6 +40,7 @@ import { getProviderDock } from "../../providers/dock.js";
import type { ProviderThreadingToolContext } from "../../providers/plugins/types.js";
import { normalizeProviderId } from "../../providers/registry.js";
import { defaultRuntime } from "../../runtime.js";
import { isReasoningTagProvider } from "../../utils/provider-utils.js";
import {
estimateUsageCost,
formatTokenCount,
@@ -411,7 +412,9 @@ export async function runReplyAgent(params: {
prompt: memoryFlushSettings.prompt,
extraSystemPrompt: flushSystemPrompt,
ownerNumbers: followupRun.run.ownerNumbers,
enforceFinalTag: followupRun.run.enforceFinalTag,
enforceFinalTag:
followupRun.run.enforceFinalTag ||
isReasoningTagProvider(provider),
provider,
model,
authProfileId: followupRun.run.authProfileId,
@@ -659,7 +662,9 @@ export async function runReplyAgent(params: {
prompt: commandBody,
extraSystemPrompt: followupRun.run.extraSystemPrompt,
ownerNumbers: followupRun.run.ownerNumbers,
enforceFinalTag: followupRun.run.enforceFinalTag,
enforceFinalTag:
followupRun.run.enforceFinalTag ||
isReasoningTagProvider(provider),
provider,
model,
authProfileId: followupRun.run.authProfileId,