fix: enforce reasoning tags on fallback providers (#810) (thanks @mcinteerj)
This commit is contained in:
@@ -10,6 +10,7 @@
|
|||||||
- Tools: allow Claude/Gemini tool param aliases (`file_path`, `old_string`, `new_string`) while enforcing required params at runtime. (#793 — thanks @hsrvc)
|
- Tools: allow Claude/Gemini tool param aliases (`file_path`, `old_string`, `new_string`) while enforcing required params at runtime. (#793 — thanks @hsrvc)
|
||||||
- Gemini: downgrade tool-call history missing `thought_signature` to avoid INVALID_ARGUMENT errors. (#793 — thanks @hsrvc)
|
- Gemini: downgrade tool-call history missing `thought_signature` to avoid INVALID_ARGUMENT errors. (#793 — thanks @hsrvc)
|
||||||
- Messaging: enforce context isolation for message tool sends across providers (normalized targets + tests). (#793 — thanks @hsrvc)
|
- Messaging: enforce context isolation for message tool sends across providers (normalized targets + tests). (#793 — thanks @hsrvc)
|
||||||
|
- Auto-reply: re-evaluate reasoning tag enforcement on fallback providers to prevent leaked reasoning. (#810 — thanks @mcinteerj)
|
||||||
|
|
||||||
## 2026.1.12-3
|
## 2026.1.12-3
|
||||||
|
|
||||||
|
|||||||
178
src/auto-reply/reply/agent-runner.reasoning-tags.test.ts
Normal file
178
src/auto-reply/reply/agent-runner.reasoning-tags.test.ts
Normal file
@@ -0,0 +1,178 @@
|
|||||||
|
import { beforeEach, describe, expect, it, vi } from "vitest";
|
||||||
|
|
||||||
|
import type { SessionEntry } from "../../config/sessions.js";
|
||||||
|
import type { TemplateContext } from "../templating.js";
|
||||||
|
import { DEFAULT_MEMORY_FLUSH_PROMPT } from "./memory-flush.js";
|
||||||
|
import type { FollowupRun, QueueSettings } from "./queue.js";
|
||||||
|
import { createMockTypingController } from "./test-helpers.js";
|
||||||
|
|
||||||
|
// Shared spies. The module factories below only reference them from inside
// forwarding functions, so each test can install its own behavior on the spy.
const runEmbeddedPiAgentMock = vi.fn();
const runWithModelFallbackMock = vi.fn();

// Replace the model-fallback module: every call is forwarded to
// runWithModelFallbackMock with the original params object.
vi.mock("../../agents/model-fallback.js", () => {
  const runWithModelFallback = (params: {
    provider: string;
    model: string;
    run: (provider: string, model: string) => Promise<unknown>;
  }) => {
    return runWithModelFallbackMock(params);
  };
  return { runWithModelFallback };
});

// Replace the embedded agent module: queueing always reports `false`, and
// agent runs are forwarded to runEmbeddedPiAgentMock.
vi.mock("../../agents/pi-embedded.js", () => {
  const runEmbeddedPiAgent = (params: unknown) => {
    return runEmbeddedPiAgentMock(params);
  };
  return {
    queueEmbeddedPiMessage: vi.fn().mockReturnValue(false),
    runEmbeddedPiAgent,
  };
});

// Keep the real queue module except for the two followup functions, which
// are stubbed with bare spies.
vi.mock("./queue.js", async () => {
  const realQueue =
    await vi.importActual<typeof import("./queue.js")>("./queue.js");
  return {
    ...realQueue,
    enqueueFollowupRun: vi.fn(),
    scheduleFollowupDrain: vi.fn(),
  };
});
|
||||||
|
|
||||||
|
import { runReplyAgent } from "./agent-runner.js";
|
||||||
|
|
||||||
|
/**
 * The subset of the params object passed to the mocked `runEmbeddedPiAgent`
 * that these tests inspect. Both fields are optional.
 */
type EmbeddedPiAgentParams = {
  /** Prompt text of the run; compared against the memory-flush prompt. */
  prompt?: string;
  /** Flag the tests assert on for fallback providers. */
  enforceFinalTag?: boolean;
};
|
||||||
|
|
||||||
|
/**
 * Calls `runReplyAgent` with a canned fixture and returns whatever it returns.
 *
 * The fixture uses a WhatsApp session context, an "interrupt" queue, and a
 * followup run whose configured provider/model is "anthropic"/"claude".
 *
 * @param params Optional overrides.
 * @param params.sessionEntry Passed through unchanged as `sessionEntry`.
 * @param params.sessionKey Session key for the run; defaults to "main".
 * @param params.agentCfgContextTokens Passed through unchanged.
 */
function createRun(params?: {
  sessionEntry?: SessionEntry;
  sessionKey?: string;
  agentCfgContextTokens?: number;
}) {
  const key = params?.sessionKey ?? "main";
  const typingController = createMockTypingController();

  const templateCtx = {
    Provider: "whatsapp",
    OriginatingTo: "+15550001111",
    AccountId: "primary",
    MessageSid: "msg",
  } as unknown as TemplateContext;

  const queueSettings = { mode: "interrupt" } as unknown as QueueSettings;

  // Run configuration nested inside the followup entry below.
  const runConfig = {
    agentId: "main",
    agentDir: "/tmp/agent",
    sessionId: "session",
    sessionKey: key,
    messageProvider: "whatsapp",
    sessionFile: "/tmp/session.jsonl",
    workspaceDir: "/tmp",
    config: {},
    skillsSnapshot: {},
    provider: "anthropic",
    model: "claude",
    thinkLevel: "low",
    verboseLevel: "off",
    elevatedLevel: "off",
    bashElevated: {
      enabled: false,
      allowed: false,
      defaultLevel: "off",
    },
    timeoutMs: 1_000,
    blockReplyBreak: "message_end",
  };

  const queuedRun = {
    prompt: "hello",
    summaryLine: "hello",
    enqueuedAt: Date.now(),
    run: runConfig,
  } as unknown as FollowupRun;

  return runReplyAgent({
    commandBody: "hello",
    followupRun: queuedRun,
    queueKey: "main",
    resolvedQueue: queueSettings,
    shouldSteer: false,
    shouldFollowup: false,
    isActive: false,
    isStreaming: false,
    typing: typingController,
    sessionCtx: templateCtx,
    sessionEntry: params?.sessionEntry,
    sessionKey: key,
    defaultModel: "anthropic/claude-opus-4-5",
    agentCfgContextTokens: params?.agentCfgContextTokens,
    resolvedVerboseLevel: "off",
    isNewSession: false,
    blockStreamingEnabled: false,
    resolvedBlockStreamingBreak: "message_end",
    shouldInjectGroupIntro: false,
    typingMode: "instant",
  });
}
|
||||||
|
|
||||||
|
// Checks that `enforceFinalTag` is set on the params handed to the embedded
// agent when the fallback runner executes the run on "google-antigravity"
// instead of the configured "anthropic" provider.
describe("runReplyAgent fallback reasoning tags", () => {
  // Stand-in for the fallback runner: executes the run callback on the
  // fallback provider/model and reports that pair as the one used.
  const runOnFallbackProvider = async ({
    run,
  }: {
    run: (provider: string, model: string) => Promise<unknown>;
  }) => {
    const result = await run("google-antigravity", "gemini-3");
    return {
      result,
      provider: "google-antigravity",
      model: "gemini-3",
    };
  };

  beforeEach(() => {
    runEmbeddedPiAgentMock.mockReset();
    runWithModelFallbackMock.mockReset();
  });

  it("enforces <final> when the fallback provider requires reasoning tags", async () => {
    runEmbeddedPiAgentMock.mockResolvedValueOnce({
      payloads: [{ text: "ok" }],
      meta: {},
    });
    runWithModelFallbackMock.mockImplementationOnce(runOnFallbackProvider);

    await createRun();

    // First argument of the first embedded-agent invocation, if any.
    const [firstInvocation] = runEmbeddedPiAgentMock.mock.calls;
    const agentParams = firstInvocation?.[0] as
      | EmbeddedPiAgentParams
      | undefined;
    expect(agentParams?.enforceFinalTag).toBe(true);
  });

  it("enforces <final> during memory flush on fallback providers", async () => {
    // The flush prompt yields no payloads; any other prompt yields "ok".
    runEmbeddedPiAgentMock.mockImplementation(
      async (params: EmbeddedPiAgentParams) => {
        const isFlush = params.prompt === DEFAULT_MEMORY_FLUSH_PROMPT;
        return isFlush
          ? { payloads: [], meta: {} }
          : { payloads: [{ text: "ok" }], meta: {} };
      },
    );
    runWithModelFallbackMock.mockImplementation(runOnFallbackProvider);

    // NOTE(review): totalTokens is presumably set high so that a memory
    // flush run is triggered — confirm against agent-runner.
    await createRun({
      sessionEntry: {
        sessionId: "session",
        updatedAt: Date.now(),
        totalTokens: 1_000_000,
        compactionCount: 0,
      },
    });

    // Pick out the invocation that carried the memory-flush prompt.
    const flushParams = runEmbeddedPiAgentMock.mock.calls
      .map(([params]) => params as EmbeddedPiAgentParams | undefined)
      .find((params) => params?.prompt === DEFAULT_MEMORY_FLUSH_PROMPT);

    expect(flushParams?.enforceFinalTag).toBe(true);
  });
});
|
||||||
@@ -40,13 +40,13 @@ import { getProviderDock } from "../../providers/dock.js";
|
|||||||
import type { ProviderThreadingToolContext } from "../../providers/plugins/types.js";
|
import type { ProviderThreadingToolContext } from "../../providers/plugins/types.js";
|
||||||
import { normalizeProviderId } from "../../providers/registry.js";
|
import { normalizeProviderId } from "../../providers/registry.js";
|
||||||
import { defaultRuntime } from "../../runtime.js";
|
import { defaultRuntime } from "../../runtime.js";
|
||||||
|
import { isReasoningTagProvider } from "../../utils/provider-utils.js";
|
||||||
import {
|
import {
|
||||||
estimateUsageCost,
|
estimateUsageCost,
|
||||||
formatTokenCount,
|
formatTokenCount,
|
||||||
formatUsd,
|
formatUsd,
|
||||||
resolveModelCostConfig,
|
resolveModelCostConfig,
|
||||||
} from "../../utils/usage-format.js";
|
} from "../../utils/usage-format.js";
|
||||||
import { isReasoningTagProvider } from "../../utils/provider-utils.js";
|
|
||||||
import { stripHeartbeatToken } from "../heartbeat.js";
|
import { stripHeartbeatToken } from "../heartbeat.js";
|
||||||
import type { OriginatingChannelType, TemplateContext } from "../templating.js";
|
import type { OriginatingChannelType, TemplateContext } from "../templating.js";
|
||||||
import { normalizeVerboseLevel, type VerboseLevel } from "../thinking.js";
|
import { normalizeVerboseLevel, type VerboseLevel } from "../thinking.js";
|
||||||
|
|||||||
Reference in New Issue
Block a user