# Calibrator
# Validates that lightweight evaluations are reliable proxies for deep evaluations
#
# Usage:
#   prose run @openprose/lib/calibrator
#
# Purpose:
#   Run both light and deep inspections on the same runs, compare results,
#   and build confidence (or identify gaps) in light evaluations.
#
# Inputs:
#   run_paths: Paths to runs to calibrate on (comma-separated or glob)
#   sample_size: How many runs to sample (if more available)
#
# Outputs:
#   - Agreement rate between light and deep
#   - Cases where they disagree
#   - Recommendations for improving light evaluation

input run_paths: "Paths to runs (comma-separated, or 'recent' for latest)"
input sample_size: "Max runs to analyze (default: 10)"

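# Note: with run_paths set to "recent" and the default sample_size of 10, the
# sampler in Phase 1 looks in .prose/runs/ and selects a diverse sample of up
# to ten runs.
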
# ============================================================
# Agents
# ============================================================

agent sampler:
  model: sonnet
  prompt: """
    You select runs for calibration analysis.
    Prefer diverse runs: different programs, outcomes, sizes.
  """

agent comparator:
  model: opus
  prompt: """
    You compare light vs deep evaluation results with nuance.
    Identify agreement, disagreement, and edge cases.
  """

agent statistician:
  model: sonnet
  prompt: """
    You compute statistics and confidence intervals.
  """

agent advisor:
  model: opus
  prompt: """
    You recommend improvements to evaluation criteria.
  """

# ============================================================
# Phase 1: Select Runs
# ============================================================

let selected_runs = session: sampler
  prompt: """
    Select runs for calibration.

    Input: {run_paths}
    Sample size: {sample_size}

    If run_paths is "recent", find recent runs in .prose/runs/
    If specific paths, use those.

    Select a diverse sample:
    - Different programs if possible
    - Mix of successful and partial/failed if available
    - Different sizes (small vs large runs)

    Return list of run paths.
  """

# ============================================================
# Phase 2: Run Both Inspection Depths
# ============================================================

let calibration_data = selected_runs | map:
  # Run light and deep sequentially on each run (can't run both in parallel on the same run)
  let light = session "Light inspection"
    prompt: """
      Run a LIGHT inspection on: {item}

      Evaluate quickly:
      - completion: did it finish cleanly?
      - binding_integrity: do expected outputs exist?
      - output_substance: do outputs have real content?
      - goal_alignment: does output match program purpose?

      Score each 1-10, give verdicts (pass/partial/fail).
      Return JSON.
    """

  let deep = session "Deep inspection"
    prompt: """
      Run a DEEP inspection on: {item}

      Evaluate thoroughly:
      - Read the full program source
      - Trace execution step by step
      - Check each binding's content
      - Evaluate output quality in detail
      - Assess fidelity (did the VM follow the program correctly?)
      - Assess efficiency (reasonable steps for the job?)

      Score each dimension 1-10, give verdicts.
      Return JSON.
    """
    context: light  # Deep can see light's assessment

  session "Package results"
    prompt: """
      Package the light and deep inspection results.

      Run: {item}
      Light: {light}
      Deep: {deep}

      Return:
      {
        "run_path": "...",
        "light": { verdicts, scores },
        "deep": { verdicts, scores },
        "agreement": {
          "vm_verdict": true/false,
          "task_verdict": true/false,
          "score_delta": { ... }
        }
      }
    """
    context: { light, deep }

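# calibration_data is assumed to collect one packaged comparison per sampled run,
# i.e. a list of the JSON records returned by "Package results" above.
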
# ============================================================
# Phase 3: Statistical Analysis
# ============================================================

let statistics = session: statistician
  prompt: """
    Compute calibration statistics.

    Data: {calibration_data}

    Calculate:
    - Overall agreement rate (how often do light and deep agree?)
    - Agreement by verdict type (vm vs task)
    - Score correlation (do light scores predict deep scores?)
    - Disagreement patterns (when do they diverge?)

    Return:
    {
      "sample_size": N,
      "agreement_rate": { overall, vm, task },
      "score_correlation": { ... },
      "disagreements": [ { run, light_said, deep_said, reason } ],
      "confidence": "high" | "medium" | "low"
    }
  """
  context: calibration_data

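# "Agreement" above is read as light and deep reaching the same verdict on a run.
# Illustrative arithmetic: 8 matching task verdicts out of 10 sampled runs gives
# a task agreement rate of 8/10 = 80%, the "medium" band used in Phase 4 below.
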
# ============================================================
# Phase 4: Recommendations
# ============================================================

let recommendations = session: advisor
  prompt: """
    Based on the calibration results, recommend improvements.

    Statistics: {statistics}
    Raw data: {calibration_data}

    If agreement is high (>90%):
    - Light evaluation is reliable
    - Note any edge cases to watch

    If agreement is medium (70-90%):
    - Identify patterns in disagreements
    - Suggest criteria adjustments

    If agreement is low (<70%):
    - Light evaluation needs work
    - Give specific recommendations for improvement

    Return:
    {
      "reliability_verdict": "reliable" | "mostly_reliable" | "needs_work",
      "key_findings": [...],
      "recommendations": [
        { "priority": 1, "action": "...", "rationale": "..." }
      ]
    }
  """
  context: { statistics, calibration_data }

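# Assumed mapping from agreement bands to reliability_verdict:
#   >90% -> "reliable", 70-90% -> "mostly_reliable", <70% -> "needs_work"
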
# ============================================================
# Output
# ============================================================

output report = session "Format report"
  prompt: """
    Format calibration results as a report.

    Statistics: {statistics}
    Recommendations: {recommendations}

    Include:
    1. Summary: Is light evaluation reliable?
    2. Agreement rates (table)
    3. Disagreement cases (if any)
    4. Recommendations
    5. Confidence level in these results

    Format as markdown.
  """
  context: { statistics, recommendations, calibration_data }