fix: guard anthropic refusal trigger

This commit is contained in:
Peter Steinberger
2026-01-21 07:28:11 +00:00
parent ab97c6880b
commit 91bcdad503
4 changed files with 268 additions and 204 deletions

View File

@@ -51,6 +51,18 @@ import { describeUnknownError } from "./utils.js";
// Local alias of ResolvedProviderAuth.
type ApiKeyInfo = ResolvedProviderAuth;
// Guard session transcripts against being poisoned by Anthropic's refusal test token:
// the token below is swapped for a redacted, human-readable stand-in.
const ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL = "ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL";
const ANTHROPIC_MAGIC_STRING_REPLACEMENT = "ANTHROPIC MAGIC STRING TRIGGER REFUSAL (redacted)";

/**
 * Replaces every occurrence of the refusal trigger token in `prompt` with the
 * redacted placeholder. Matching is exact and case-sensitive.
 *
 * @param prompt - Prompt text to sanitize.
 * @returns The sanitized prompt, or `prompt` itself when the token is absent.
 */
function scrubAnthropicRefusalMagic(prompt: string): string {
  const pieces = prompt.split(ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL);
  // Exactly one piece means the token never appeared, so nothing needs redacting.
  if (pieces.length === 1) {
    return prompt;
  }
  return pieces.join(ANTHROPIC_MAGIC_STRING_REPLACEMENT);
}
export async function runEmbeddedPiAgent(
params: RunEmbeddedPiAgentParams,
): Promise<EmbeddedPiRunResult> {
@@ -211,6 +223,9 @@ export async function runEmbeddedPiAgent(
attemptedThinking.add(thinkLevel);
await fs.mkdir(resolvedWorkspace, { recursive: true });
const prompt =
provider === "anthropic" ? scrubAnthropicRefusalMagic(params.prompt) : params.prompt;
const attempt = await runEmbeddedAttempt({
sessionId: params.sessionId,
sessionKey: params.sessionKey,
@@ -228,7 +243,7 @@ export async function runEmbeddedPiAgent(
agentDir,
config: params.config,
skillsSnapshot: params.skillsSnapshot,
prompt: params.prompt,
prompt,
images: params.images,
provider,
modelId,