fix: guard anthropic refusal trigger
This commit is contained in:
@@ -51,6 +51,18 @@ import { describeUnknownError } from "./utils.js";
|
||||
|
||||
type ApiKeyInfo = ResolvedProviderAuth;
|
||||
|
||||
// Keep Anthropic's refusal test token from poisoning session transcripts.
const ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL = "ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL";
const ANTHROPIC_MAGIC_STRING_REPLACEMENT = "ANTHROPIC MAGIC STRING TRIGGER REFUSAL (redacted)";

/**
 * Replaces every occurrence of the refusal trigger token in `prompt` with a
 * redacted placeholder.
 *
 * @param prompt - Prompt text to scrub.
 * @returns The scrubbed prompt; a prompt without the token is returned unchanged.
 */
function scrubAnthropicRefusalMagic(prompt: string): string {
  const pieces = prompt.split(ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL);
  // A single piece means the token never appeared, so hand back the input as-is.
  if (pieces.length === 1) {
    return prompt;
  }
  return pieces.join(ANTHROPIC_MAGIC_STRING_REPLACEMENT);
}
|
||||
|
||||
export async function runEmbeddedPiAgent(
|
||||
params: RunEmbeddedPiAgentParams,
|
||||
): Promise<EmbeddedPiRunResult> {
|
||||
@@ -211,6 +223,9 @@ export async function runEmbeddedPiAgent(
|
||||
attemptedThinking.add(thinkLevel);
|
||||
await fs.mkdir(resolvedWorkspace, { recursive: true });
|
||||
|
||||
const prompt =
|
||||
provider === "anthropic" ? scrubAnthropicRefusalMagic(params.prompt) : params.prompt;
|
||||
|
||||
const attempt = await runEmbeddedAttempt({
|
||||
sessionId: params.sessionId,
|
||||
sessionKey: params.sessionKey,
|
||||
@@ -228,7 +243,7 @@ export async function runEmbeddedPiAgent(
|
||||
agentDir,
|
||||
config: params.config,
|
||||
skillsSnapshot: params.skillsSnapshot,
|
||||
prompt: params.prompt,
|
||||
prompt,
|
||||
images: params.images,
|
||||
provider,
|
||||
modelId,
|
||||
|
||||
@@ -38,6 +38,7 @@ const PROVIDERS = parseFilter(process.env.CLAWDBOT_LIVE_GATEWAY_PROVIDERS);
|
||||
const THINKING_LEVEL = "high";
|
||||
// Matches an opening or closing <think>, <thinking>, <thought>, or <antthinking> tag
// (case-insensitive, optional whitespace around the slash and the tag name).
const THINKING_TAG_RE = /<\s*\/?\s*(?:think(?:ing)?|thought|antthinking)\s*>/i;
|
||||
// Matches an opening or closing <final> tag (case-insensitive, optional whitespace).
const FINAL_TAG_RE = /<\s*\/?\s*final\s*>/i;
|
||||
// Prefix that buildAnthropicRefusalToken extends with a random suffix for the refusal probe.
const ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL = "ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL";
|
||||
|
||||
// Use the real `describe` only when LIVE or GATEWAY_LIVE is truthy; otherwise skip the suite.
const describeLive = LIVE || GATEWAY_LIVE ? describe : describe.skip;
|
||||
|
||||
@@ -120,6 +121,73 @@ function isEmptyStreamText(text: string): boolean {
|
||||
return text.includes("request ended without sending any chunks");
|
||||
}
|
||||
|
||||
/**
 * Builds a fresh probe token: the refusal trigger prefix, an underscore, and a
 * random UUID with its dashes removed.
 *
 * @returns A token that differs on every call.
 */
function buildAnthropicRefusalToken(): string {
  const dashlessUuid = randomUUID().split("-").join("");
  return [ANTHROPIC_MAGIC_STRING_TRIGGER_REFUSAL, dashlessUuid].join("_");
}
|
||||
|
||||
/**
 * Two-turn live check that a message carrying the refusal trigger prefix does
 * not break the session.
 *
 * Turn one sends a freshly built refusal token and expects the reply to contain
 * the word "ok". Turn two reuses the same session key and expects the reply to
 * contain both "still" and "ok". Each reply is also passed through
 * `assertNoReasoningTags`.
 *
 * @param params - Gateway client, session key, model key, progress label, and
 *   thinking level to use for both turns.
 * @throws Error when a turn's status is not "ok" or the expected words are
 *   missing from the reply text.
 */
async function runAnthropicRefusalProbe(params: {
  client: GatewayClient;
  sessionKey: string;
  modelKey: string;
  label: string;
  thinkingLevel: string;
}): Promise<void> {
  const { client, sessionKey, modelKey, label, thinkingLevel } = params;

  // Runs one agent turn: sends `message`, requires status "ok", runs the
  // reasoning-tag assertion, and returns the extracted reply text.
  const runTurn = async (turn: {
    idempotencyKey: string;
    message: string;
    phase: "refusal-probe" | "refusal-followup";
    failurePrefix: string;
  }): Promise<string> => {
    const reply = await client.request<AgentFinalPayload>(
      "agent",
      {
        sessionKey,
        idempotencyKey: turn.idempotencyKey,
        message: turn.message,
        thinking: thinkingLevel,
        deliver: false,
      },
      { expectFinal: true },
    );
    if (reply?.status !== "ok") {
      throw new Error(`${turn.failurePrefix} failed: status=${String(reply?.status)}`);
    }
    const text = extractPayloadText(reply?.result);
    assertNoReasoningTags({ text, model: modelKey, phase: turn.phase, label });
    return text;
  };

  logProgress(`${label}: refusal-probe`);

  // First turn: the message embeds the trigger-prefixed token.
  const magic = buildAnthropicRefusalToken();
  const probeText = await runTurn({
    idempotencyKey: `idem-${randomUUID()}-refusal`,
    message: `Reply with the single word ok. Test token: ${magic}`,
    phase: "refusal-probe",
    failurePrefix: "refusal probe",
  });
  const probeHasOk = /\bok\b/i.test(probeText);
  if (!probeHasOk) {
    throw new Error(`refusal probe missing ok: ${probeText}`);
  }

  // Second turn on the same session: the conversation must still be usable.
  const followupText = await runTurn({
    idempotencyKey: `idem-${randomUUID()}-refusal-followup`,
    message: "Now reply with exactly: still ok.",
    phase: "refusal-followup",
    failurePrefix: "refusal followup",
  });
  const followupMatches = /\bstill\b/i.test(followupText) && /\bok\b/i.test(followupText);
  if (!followupMatches) {
    throw new Error(`refusal followup missing expected text: ${followupText}`);
  }
}
|
||||
|
||||
function randomImageProbeCode(len = 6): string {
|
||||
// Chosen to avoid common OCR confusions in our 5x7 bitmap font.
|
||||
// Notably: 0↔8, B↔8, 6↔9, 3↔B, D↔0.
|
||||
@@ -736,6 +804,16 @@ async function runGatewayModelSuite(params: GatewayModelSuiteParams) {
|
||||
}
|
||||
}
|
||||
|
||||
if (model.provider === "anthropic") {
|
||||
await runAnthropicRefusalProbe({
|
||||
client,
|
||||
sessionKey,
|
||||
modelKey,
|
||||
label: progressLabel,
|
||||
thinkingLevel: params.thinkingLevel,
|
||||
});
|
||||
}
|
||||
|
||||
logProgress(`${progressLabel}: done`);
|
||||
break;
|
||||
} catch (err) {
|
||||
|
||||
Reference in New Issue
Block a user