fix: guard anthropic refusal trigger

This commit is contained in:
Peter Steinberger
2026-01-21 07:28:11 +00:00
parent ab97c6880b
commit 91bcdad503
4 changed files with 268 additions and 204 deletions

View File

@@ -51,6 +51,18 @@ import { describeUnknownError } from "./utils.js";
type ApiKeyInfo = ResolvedProviderAuth;
// Avoid Anthropic's refusal test token poisoning session transcripts.
const ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL = "ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL";
const ANTHROPIC_MAGIC_STRING_REPLACEMENT = "ANTHROPIC MAGIC STRING TRIGGER REFUSAL (redacted)";

/**
 * Neutralizes the refusal trigger marker inside a prompt.
 *
 * Every literal occurrence of `ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL` is
 * swapped for the redacted placeholder; a prompt that does not contain the
 * marker comes back unchanged.
 *
 * @param prompt - Raw prompt text.
 * @returns The prompt with all marker occurrences replaced.
 */
function scrubAnthropicRefusalMagic(prompt: string): string {
  // Splitting on the marker and re-joining with the placeholder is a literal,
  // all-occurrences substitution. With no match the single piece is re-joined
  // into the original text.
  const pieces = prompt.split(ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL);
  return pieces.join(ANTHROPIC_MAGIC_STRING_REPLACEMENT);
}
export async function runEmbeddedPiAgent(
params: RunEmbeddedPiAgentParams,
): Promise<EmbeddedPiRunResult> {
@@ -211,6 +223,9 @@ export async function runEmbeddedPiAgent(
attemptedThinking.add(thinkLevel);
await fs.mkdir(resolvedWorkspace, { recursive: true });
const prompt =
provider === "anthropic" ? scrubAnthropicRefusalMagic(params.prompt) : params.prompt;
const attempt = await runEmbeddedAttempt({
sessionId: params.sessionId,
sessionKey: params.sessionKey,
@@ -228,7 +243,7 @@ export async function runEmbeddedPiAgent(
agentDir,
config: params.config,
skillsSnapshot: params.skillsSnapshot,
prompt: params.prompt,
prompt,
images: params.images,
provider,
modelId,

View File

@@ -38,6 +38,7 @@ const PROVIDERS = parseFilter(process.env.CLAWDBOT_LIVE_GATEWAY_PROVIDERS);
// Thinking level constant for this live suite.
const THINKING_LEVEL = "high";
// Matches an opening or closing <think>, <thinking>, <thought> or <antthinking> tag,
// case-insensitively, tolerating whitespace around the slash and tag name.
const THINKING_TAG_RE = /<\s*\/?\s*(?:think(?:ing)?|thought|antthinking)\s*>/i;
// Matches an opening or closing <final> tag with the same whitespace/case tolerance.
const FINAL_TAG_RE = /<\s*\/?\s*final\s*>/i;
// Prefix used to build the refusal-probe test token sent in probe messages.
const ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL = "ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL";
// Register the suite only when LIVE or GATEWAY_LIVE is set; otherwise skip it.
const describeLive = LIVE || GATEWAY_LIVE ? describe : describe.skip;
@@ -120,6 +121,73 @@ function isEmptyStreamText(text: string): boolean {
return text.includes("request ended without sending any chunks");
}
/**
 * Builds a unique refusal-probe token.
 *
 * @returns `ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL`, an underscore, and a
 *   freshly generated UUID with its dashes removed.
 */
function buildAnthropicRefusalToken(): string {
  // Drop the dashes so the random part is one unbroken run of characters.
  const compactUuid = randomUUID().split("-").join("");
  return [ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL, compactUuid].join("_");
}
/**
 * Runs a two-turn refusal probe against the gateway on one session key.
 *
 * Turn one sends a message containing a freshly built refusal test token and
 * expects a reply containing the word "ok". Turn two sends a follow-up on the
 * same session key and expects a reply containing both "still" and "ok".
 * Both replies are also passed through `assertNoReasoningTags`.
 *
 * @param params.client - Gateway client used to issue the "agent" requests.
 * @param params.sessionKey - Session key shared by both turns.
 * @param params.modelKey - Model identifier forwarded to the reasoning-tag assertion.
 * @param params.label - Label used in the progress log and the assertions.
 * @param params.thinkingLevel - Value sent as the `thinking` field of each request.
 * @throws Error when a turn's status is not "ok" or its text lacks the expected words.
 */
async function runAnthropicRefusalProbe(params: {
  client: GatewayClient;
  sessionKey: string;
  modelKey: string;
  label: string;
  thinkingLevel: string;
}): Promise<void> {
  const { client, sessionKey, modelKey, label, thinkingLevel } = params;
  logProgress(`${label}: refusal-probe`);

  // Both turns share the same request shape; only the idempotency-key suffix
  // and the message text differ.
  const sendTurn = (keySuffix: string, message: string) =>
    client.request<AgentFinalPayload>(
      "agent",
      {
        sessionKey,
        idempotencyKey: `idem-${randomUUID()}-${keySuffix}`,
        message,
        thinking: thinkingLevel,
        deliver: false,
      },
      { expectFinal: true },
    );

  // Turn 1: message carrying the test token.
  const token = buildAnthropicRefusalToken();
  const firstReply = await sendTurn(
    "refusal",
    `Reply with the single word ok. Test token: ${token}`,
  );
  if (firstReply?.status !== "ok") {
    throw new Error(`refusal probe failed: status=${String(firstReply?.status)}`);
  }
  const firstText = extractPayloadText(firstReply?.result);
  assertNoReasoningTags({
    text: firstText,
    model: modelKey,
    phase: "refusal-probe",
    label,
  });
  const firstHasOk = /\bok\b/i.test(firstText);
  if (!firstHasOk) {
    throw new Error(`refusal probe missing ok: ${firstText}`);
  }

  // Turn 2: follow-up on the same session key.
  const secondReply = await sendTurn(
    "refusal-followup",
    "Now reply with exactly: still ok.",
  );
  if (secondReply?.status !== "ok") {
    throw new Error(`refusal followup failed: status=${String(secondReply?.status)}`);
  }
  const secondText = extractPayloadText(secondReply?.result);
  assertNoReasoningTags({
    text: secondText,
    model: modelKey,
    phase: "refusal-followup",
    label,
  });
  const hasBothWords = /\bstill\b/i.test(secondText) && /\bok\b/i.test(secondText);
  if (!hasBothWords) {
    throw new Error(`refusal followup missing expected text: ${secondText}`);
  }
}
function randomImageProbeCode(len = 6): string {
// Chosen to avoid common OCR confusions in our 5x7 bitmap font.
// Notably: 0↔8, B↔8, 6↔9, 3↔B, D↔0.
@@ -736,6 +804,16 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
}
}
if (model.provider === "anthropic") {
await runAnthropicRefusalProbe({
client,
sessionKey,
modelKey,
label: progressLabel,
thinkingLevel: params.thinkingLevel,
});
}
logProgress(`${progressLabel}: done`);
break;
} catch (err) {