From d0c986c4f097482ae970af25768e5ff98aff43f1 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Fri, 16 Jan 2026 09:33:59 +0000 Subject: [PATCH] feat: warn on weak model tiers --- CHANGELOG.md | 1 + docs/gateway/security.md | 9 +++++ src/security/audit-extra.ts | 65 +++++++++++++++++++++++++++++++++++++ src/security/audit.test.ts | 18 ++++++++++ 4 files changed, 93 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 04ae92ed5..2917b58e6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ - TUI: show provider/model labels for the active session and default model. - Heartbeat: add per-agent heartbeat configuration and multi-agent docs example. - UI: show gateway auth guidance + doc link on unauthorized Control UI connections. +- Security: warn on weak model tiers (Haiku, below GPT-5, below Claude 4.5) in `clawdbot security audit`. - Fix: list model picker entries as provider/model pairs for explicit selection. (#970) — thanks @mcinteerj. - Fix: align OpenAI image-gen defaults with DALL-E 3 standard quality and document output formats. (#880) — thanks @mkbehr. - Fix: persist `gateway.mode=local` after selecting Local run mode in `clawdbot configure`, even if no other sections are chosen. diff --git a/docs/gateway/security.md b/docs/gateway/security.md index 22552d9f6..3106ca8ba 100644 --- a/docs/gateway/security.md +++ b/docs/gateway/security.md @@ -161,6 +161,15 @@ Even with strong system prompts, **prompt injection is not solved**. What helps - Run sensitive tool execution in a sandbox; keep secrets out of the agent’s reachable filesystem. - **Model choice matters:** older/legacy models can be less robust against prompt injection and tool misuse. Prefer modern, instruction-hardened models for any bot with tools. We recommend Anthropic Opus 4.5 because it’s quite good at recognizing prompt injections (see [“A step forward on safety”](https://www.anthropic.com/news/claude-opus-4-5)). +### Model strength (security note) + +Prompt injection resistance is **not** uniform across model tiers. Smaller/cheaper models are generally more susceptible to tool misuse and instruction hijacking, especially under adversarial prompts. + +Recommendations: +- **Use the latest generation, best-tier model** for any bot that can run tools or touch files/networks. +- **Avoid weaker tiers** (for example, Sonnet or Haiku) for tool-enabled agents or untrusted inboxes. +- If you must use a smaller model, **reduce blast radius** (read-only tools, strong sandboxing, minimal filesystem access, strict allowlists). + ## Reasoning & verbose output in groups `/reasoning` and `/verbose` can expose internal reasoning or tool output that diff --git a/src/security/audit-extra.ts b/src/security/audit-extra.ts index 5fc4332b7..48023d718 100644 --- a/src/security/audit-extra.ts +++ b/src/security/audit-extra.ts @@ -260,11 +260,57 @@ const LEGACY_MODEL_PATTERNS: Array<{ id: string; re: RegExp; label: string }> = { id: "openai.gpt4_legacy", re: /\bgpt-4-(0314|0613)\b/i, label: "Legacy GPT-4 snapshots" }, ]; +const WEAK_TIER_MODEL_PATTERNS: Array<{ id: string; re: RegExp; label: string }> = [ + { id: "anthropic.haiku", re: /\bhaiku\b/i, label: "Haiku tier (smaller model)" }, +]; + +function isGptModel(id: string): boolean { + return /\bgpt-/i.test(id); +} + +function isGpt5OrHigher(id: string): boolean { + return /\bgpt-5(?:\b|[.-])/i.test(id); +} + +function isClaudeModel(id: string): boolean { + return /\bclaude-/i.test(id); +} + +function isClaude45OrHigher(id: string): boolean { + return /\bclaude-[^\s/]*?(?:-4-5\b|4\.5\b)/i.test(id); +} + export function collectModelHygieneFindings(cfg: ClawdbotConfig): SecurityAuditFinding[] { const findings: SecurityAuditFinding[] = []; const models = collectModels(cfg); if (models.length === 0) return findings; + const weakMatches = new Map(); + const addWeakMatch = (model: string, source: string, reason: string) => { + const key = `${model}@@${source}`; + const existing = weakMatches.get(key); + if (!existing) { + weakMatches.set(key, { model, source, reasons: [reason] }); + return; + } + if (!existing.reasons.includes(reason)) existing.reasons.push(reason); + }; + + for (const entry of models) { + for (const pat of WEAK_TIER_MODEL_PATTERNS) { + if (pat.re.test(entry.id)) { + addWeakMatch(entry.id, entry.source, pat.label); + break; + } + } + if (isGptModel(entry.id) && !isGpt5OrHigher(entry.id)) { + addWeakMatch(entry.id, entry.source, "Below GPT-5 family"); + } + if (isClaudeModel(entry.id) && !isClaude45OrHigher(entry.id)) { + addWeakMatch(entry.id, entry.source, "Below Claude 4.5"); + } + } + const matches: Array<{ model: string; source: string; reason: string }> = []; for (const entry of models) { for (const pat of LEGACY_MODEL_PATTERNS) { @@ -293,6 +339,25 @@ export function collectModelHygieneFindings(cfg: ClawdbotConfig): SecurityAuditF }); } + if (weakMatches.size > 0) { + const lines = Array.from(weakMatches.values()) + .slice(0, 12) + .map((m) => `- ${m.model} (${m.reasons.join("; ")}) @ ${m.source}`) + .join("\n"); + const more = weakMatches.size > 12 ? `\n…${weakMatches.size - 12} more` : ""; + findings.push({ + checkId: "models.weak_tier", + severity: "warn", + title: "Some configured models are below recommended tiers", + detail: + "Smaller/older models are generally more susceptible to prompt injection and tool misuse.\n" + + lines + + more, + remediation: + "Use the latest, top-tier model for any bot with tools or untrusted inboxes. Avoid Haiku tiers; prefer GPT-5+ and Claude 4.5+.", + }); + } + return findings; } diff --git a/src/security/audit.test.ts b/src/security/audit.test.ts index 7ff7f349a..fd02f208f 100644 --- a/src/security/audit.test.ts +++ b/src/security/audit.test.ts @@ -269,6 +269,24 @@ describe("security audit", () => { ); }); + it("warns on weak model tiers", async () => { + const cfg: ClawdbotConfig = { + agents: { defaults: { model: { primary: "anthropic/claude-haiku-4-5" } } }, + }; + + const res = await runSecurityAudit({ + config: cfg, + includeFilesystem: false, + includeChannelSecurity: false, + }); + + expect(res.findings).toEqual( + expect.arrayContaining([ + expect.objectContaining({ checkId: "models.weak_tier", severity: "warn" }), + ]), + ); + }); + it("warns when hooks token looks short", async () => { const cfg: ClawdbotConfig = { hooks: { enabled: true, token: "short" },