# Post-Run Inspector
# Analyzes completed .prose runs for runtime fidelity and task effectiveness
#
# Usage:
#   prose run @openprose/lib/inspector
#
# Inputs:
#   run_path: Path to the run to inspect (e.g., .prose/runs/20260119-100000-abc123)
#   depth:    light | deep
#   target:   vm | task | all
#
# Compounding: Each inspection builds on prior inspections via a persistent index agent.
# The index agent uses `persist: user`, so inspection history spans all projects.

input run_path: "Path to the run to inspect (e.g., .prose/runs/20260119-100000-abc123)"
input depth: "Inspection depth: light or deep"
input target: "Evaluation target: vm, task, or all"

# ============================================================
# Agents
# ============================================================

agent index:
  model: haiku
  persist: user
  prompt: """
    You maintain the inspection registry across all projects.
    Track: target_run_id, depth, target, timestamp, verdict.
    Return JSON when queried. Store compactly.
  """

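# Illustrative only: one registry entry the index agent might keep, assuming it
# stores the fields named in its prompt (exact storage format is left to the agent):
#   { "target_run_id": "20260119-100000-abc123", "depth": "deep",
#     "target": "all", "timestamp": "2026-01-19T10:30:00Z", "verdict": "pass" }
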
agent extractor:
  model: sonnet
  prompt: """
    You extract structured data from .prose run artifacts.
    Read state.md, bindings/, and logs carefully.
    Return clean JSON.
  """

agent evaluator:
  model: opus
  prompt: """
    You evaluate .prose runs with intelligent judgment.
    Rate 1-10 with specific rationale. Be concrete.
  """

agent synthesizer:
  model: sonnet
  prompt: """
    You produce clear reports in requested formats.
  """

# ============================================================
# Phase 0: Check Prior Work
# ============================================================

let prior = resume: index
  prompt: """
    Any prior inspections for: {run_path}?
    Return JSON: { "inspections": [...], "has_light": bool, "has_deep": bool }
  """

# ============================================================
# Phase 1: Extraction
# ============================================================

let extraction = session: extractor
  prompt: """
    Extract from run at: {run_path}
    Depth: {depth}
    Prior work: {prior}

    ALWAYS get:
    - run_id (from path)
    - completed (did state.md show completion?)
    - error_count (failures in state.md)
    - binding_names (list all bindings/)
    - output_names (bindings with kind: output)

    IF depth=deep AND no prior deep inspection:
    - program_source (contents of program.prose)
    - execution_summary (key statements from state.md)
    - binding_previews (first 300 chars of each binding)

    IF prior deep exists, skip deep extraction and note "using cached".

    Return JSON.
  """
  context: prior

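# Illustrative only: the shape of a light-depth extraction result, assuming the
# field names requested in the prompt above (values are hypothetical):
#   { "run_id": "20260119-100000-abc123", "completed": true, "error_count": 0,
#     "binding_names": ["..."], "output_names": ["..."] }
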
# ============================================================
# Phase 2: Evaluation
# ============================================================

let evaluation = session: evaluator
  prompt: """
    Evaluate this run.

    Target: {target}
    Depth: {depth}
    Data: {extraction}
    Prior findings: {prior}

    FOR vm (if target=vm or all):
    - completion (1-10): Clean finish?
    - binding_integrity (1-10): Expected outputs exist with content?
    - vm_verdict: pass/partial/fail
    - vm_notes: 1-2 sentences

    FOR task (if target=task or all):
    - output_substance (1-10): Outputs look real, not empty/error?
    - goal_alignment (1-10): Based on program name, does output fit?
    - task_verdict: pass/partial/fail
    - task_notes: 1-2 sentences

    IF depth=deep, add:
    - fidelity (1-10): Execution trace matches program structure?
    - efficiency (1-10): Reasonable number of steps for the job?

    Return JSON with all applicable fields.
  """
  context: extraction

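# Illustrative only: an evaluation result for target=all at deep depth, with
# hypothetical scores (field names follow the prompt above):
#   { "completion": 9, "binding_integrity": 8, "vm_verdict": "pass", "vm_notes": "...",
#     "output_substance": 7, "goal_alignment": 8, "task_verdict": "pass", "task_notes": "...",
#     "fidelity": 8, "efficiency": 7 }
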
# ============================================================
# Phase 3: Synthesis
# ============================================================

parallel:
  verdict = session: synthesizer
    prompt: """
      Machine-readable verdict as JSON:
      {
        "run_id": "...",
        "depth": "{depth}",
        "target": "{target}",
        "vm": { "verdict": "...", "scores": {...} },
        "task": { "verdict": "...", "scores": {...} },
        "flags": []
      }

      Data: {evaluation}
    """
    context: evaluation

  diagram = session: synthesizer
    prompt: """
      Simple mermaid flowchart of the run.
      Show: inputs -> key steps -> outputs.
      Use execution_summary if available, else infer from bindings.
      Output only the mermaid code.

      Data: {extraction}
    """
    context: extraction

  report = session: synthesizer
    prompt: """
      2-paragraph markdown summary:
      1. What was inspected, key metrics
      2. Findings and any recommendations

      Data: {extraction}, {evaluation}
    """
    context: { extraction, evaluation }

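# Illustrative only: the general shape of the diagram branch's output; the
# synthesizer picks the actual node names from the inspected run:
#   flowchart LR
#     inputs --> step_1 --> step_2 --> outputs
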
# ============================================================
# Phase 4: Register
# ============================================================

resume: index
  prompt: """
    Register this inspection:
    run_path: {run_path}
    depth: {depth}
    target: {target}
    verdict: {verdict}

    Update your memory with this entry.
  """
  context: verdict

# ============================================================
# Output
# ============================================================

output inspection = session: synthesizer
  prompt: """
    Combine into final output structure:

    verdict_json: {verdict}
    mermaid: {diagram}
    summary: {report}

    Return as JSON with these three fields.
  """
  context: { verdict, diagram, report }

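# Illustrative only: the final inspection output combines the three synthesis
# results into one JSON object, e.g. (values hypothetical):
#   { "verdict_json": { "run_id": "...", "vm": {...}, "task": {...}, "flags": [] },
#     "mermaid": "flowchart LR ...",
#     "summary": "..." }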