feat: add OpenProse plugin skills
This commit is contained in:
@@ -0,0 +1,637 @@
# /run Endpoint UX Test with Error Remediation
#
# A multi-agent observation protocol for qualitative UX testing of the
# OpenProse /run endpoint, WITH automated error investigation and remediation.
#
# This extends the basic UX test with a comprehensive error handling pipeline:
# - If blocking errors are detected, investigate using logs, database, and code
# - Verify diagnosis through synthesis loop
# - Triage: quick fix vs. bigger change requiring CEO oversight
# - Quick fixes: engineer implements, deploys, tests, iterates
# - Bigger changes: build plan, parallel engineers, review, deploy, smoke test
#
# Key patterns demonstrated:
# - Mid-program `input` for user checkpoints
# - Persistent agents with `resume:` for accumulated context
# - Parallel investigation with multiple angles
# - `choice` blocks for triage decisions
# - `retry` with backoff for flaky operations
# - Recursive self-healing (if fix fails, re-test)

# Default test program (simple hello world)
const test_program = """
# Quick Hello
session "Say hello and count to 5"
"""

# Auto-auth: Read credentials from .env.test and fetch token
let api_url = session "Read API URL"
  prompt: """Read the TEST_API_URL from .env.test and return just the URL.
If not found, default to: https://api-v2.prose.md"""

let auth_token = session "Authenticate"
  prompt: """Read credentials from .env.test (TEST_EMAIL, TEST_PASSWORD).
Then POST to {api_url}/auth/login with these credentials.
Return just the token value (no Bearer prefix)."""
  context: api_url

# ============================================================================
# Agent Definitions
# ============================================================================

# --- Observation Team ---

# Persistent UX researcher that watches the WebSocket execution stream.
agent ws_observer:
  model: opus
  persist: true
  prompt: """You are a UX researcher observing an OpenProse program execution.

Your job is to watch the WebSocket execution stream and evaluate the experience
from a USER's perspective - not as an engineer checking correctness.

Focus on:
- Latency and responsiveness (does it FEEL fast?)
- Clarity of status transitions (does the user know what's happening?)
- Quality of streamed events (are they informative? overwhelming? sparse?)
- Error messages (helpful or cryptic?)
- Overall flow (smooth or jarring?)

Log your raw observations, then periodically synthesize into user feedback.
Think: "If I were a first-time user, what would I think right now?"
"""

# Persistent UX researcher that polls the filesystem for state changes.
agent file_observer:
  model: opus
  persist: true
  prompt: """You are a UX researcher monitoring the file system during execution.

Your job is to observe how the filesystem changes as a program runs, evaluating
whether the state management would make sense to a user browsing files.

Focus on:
- Directory structure clarity (can a user understand what's where?)
- File naming conventions (self-documenting or cryptic?)
- State file contents (readable? useful for debugging?)
- Timing of file creation/modification (predictable?)
- What a file browser UI should show

You will poll periodically and note changes between snapshots.
"""

# One-shot synthesizer: merges both observers' findings and emits the
# Error Classification footer the main workflow branches on.
agent synthesizer:
  model: opus
  prompt: """You are a senior UX researcher synthesizing observations from
multiple sources into prioritized, actionable feedback.

Your output should be:
1. Correlated findings (where did both observers notice the same thing?)
2. Prioritized action items (high/medium/low)
3. Specific quotes/evidence supporting each finding
4. Recommendations that are concrete and implementable

Be direct. "The loading state is confusing" not "Consider potentially improving..."

IMPORTANT: At the end of your synthesis, include:

## Error Classification
blocking_error: true/false
error_summary: "One-line description of the blocking error, if any"
"""

# --- Remediation Team ---

# Persistent investigator: used by investigate_error for parallel angles,
# then resumed for synthesis and diagnosis refinement.
agent researcher:
  model: opus
  persist: true
  prompt: """You are a senior engineer investigating a production error.

Your job is to diagnose the ROOT CAUSE of errors by:
1. Reading relevant log files
2. Querying the database for related records
3. Examining the source code that produced the error
4. Tracing the execution path

Be thorough but focused. Follow the evidence. Don't speculate without data.

Output a structured diagnosis:
- Error symptom: What the user/system observed
- Root cause: The underlying technical issue
- Evidence: Specific logs, code, or data supporting your diagnosis
- Confidence: High/Medium/Low
- Affected components: Which files/services are involved
"""

# Skeptical second opinion on the researcher's diagnosis; its
# diagnosis_sound flag gates the main workflow's verification loop.
agent diagnosis_verifier:
  model: opus
  prompt: """You are a staff engineer verifying a diagnosis.

Your job is to critically evaluate a proposed diagnosis by:
1. Checking if the evidence actually supports the conclusion
2. Looking for alternative explanations
3. Verifying the logic chain from symptom to root cause
4. Identifying gaps in the investigation

Be skeptical but fair. A good diagnosis should be:
- Supported by concrete evidence (not just plausible)
- Specific (not vague like "something went wrong")
- Actionable (points to what needs to be fixed)

Output:
- diagnosis_sound: true/false
- critique: What's wrong or missing (if not sound)
- follow_up_questions: What the researcher should investigate (if not sound)
- approved_diagnosis: The verified diagnosis (if sound)
"""

# Classifies a verified diagnosis as "quick_fix" or "bigger_change";
# the main workflow's choice block routes on this decision.
agent triage_expert:
  model: opus
  prompt: """You are a tech lead triaging a diagnosed bug.

Evaluate the diagnosis and categorize the fix:

QUICK FIX criteria (ALL must be true):
- Isolated bug affecting < 3 files
- No architectural changes required
- No API contract changes
- No security implications
- Estimated effort < 1 hour
- Low risk of regression

BIGGER CHANGE criteria (ANY triggers this):
- Affects > 3 files or multiple services
- Requires architectural decisions
- Changes API contracts or data models
- Has security implications
- Requires CEO/stakeholder input
- High risk of regression
- Unclear solution path

Output:
- triage_decision: "quick_fix" or "bigger_change"
- rationale: Why this classification
- risk_assessment: What could go wrong
- recommended_approach: High-level fix strategy
"""

# Persistent implementer: writes fixes in quick_fix_cycle and executes
# plan phases in bigger_change_flow; resumed to address review feedback.
agent engineer:
  model: opus
  persist: true
  prompt: """You are a senior engineer implementing a fix.

Your job is to:
1. Understand the diagnosis and recommended approach
2. Write clean, tested code that fixes the issue
3. Follow existing patterns in the codebase
4. Create atomic commits with clear messages
5. Verify the fix works

Do not over-engineer. Fix the issue directly and simply.
Follow the project's coding standards and testing patterns.
"""

# Produces the phased plan (plan.phases) that bigger_change_flow pmaps
# over with parallel engineers.
agent build_planner:
  model: opus
  prompt: """You are a software architect creating a build plan.

Follow the standards in docs/PLANNING_BEST_PRACTICES.md:
- Break work into self-contained phases
- Each phase should be testable and committable
- Identify parallel work where possible
- Define clear verification criteria
- Plan for rollback

Output a structured plan with:
- Phases (numbered, with dependencies)
- Tasks per phase
- Verification steps
- Commit strategy
- Risk mitigation
"""

# Reviews implementations; review_approved gates the quick-fix loop and
# the bigger-change deploy step.
agent reviewer:
  model: opus
  prompt: """You are a senior engineer reviewing a fix.

Evaluate the implementation by:
1. Checking git diff against the original diagnosis
2. Verifying the fix addresses the root cause
3. Looking for regressions or side effects
4. Checking test coverage
5. Reviewing code quality and patterns

Be thorough but not nitpicky. Focus on correctness and safety.

Output:
- review_approved: true/false
- issues: List of blocking issues (if not approved)
- suggestions: Non-blocking improvements
- confidence: How confident are you the fix is correct
"""

# Post-deployment verification; smoke_test_passed feeds
# deploy_and_verify's success flag.
agent smoke_tester:
  model: opus
  prompt: """You are a QA engineer performing post-deployment verification.

Follow the procedures in docs/MONITORING.md to verify:
1. Health endpoints are responding
2. The specific bug is fixed
3. No new errors in logs
4. Key metrics are stable

Output:
- smoke_test_passed: true/false
- checks_performed: List of verifications done
- issues_found: Any problems discovered
- recommendations: Monitoring or follow-up suggestions
"""

# ============================================================================
# Blocks: Observation
# ============================================================================

# Connects the ws_observer to the execution WebSocket, submits the program,
# narrates the stream until a terminal status, then emits ws_feedback.
block observe_websocket(ws_url, token, program):
  session: ws_observer
    prompt: """Connect to the WebSocket at:
{ws_url}&token={token}

Once connected, send the execute message:
{"type":"execute","program":<the program>}

Program:
```
{program}
```

Log your initial connection experience."""

  loop until **execution completed (received status: completed/failed/aborted)**:
    resume: ws_observer
      prompt: """Continue observing the WebSocket stream.

Log each message with timestamp, type, content, and your interpretation.
After every 3-5 messages, synthesize: what would a user be thinking?"""

  output ws_feedback = resume: ws_observer
    prompt: """The execution has completed. Write your final assessment:
1. Total duration and event count
2. Status transitions observed
3. What worked well from a UX perspective
4. Pain points and confusion
5. Top 3 recommendations"""

# Polls the environment's file tree via the HTTP API, diffing snapshots
# until the run ends, then emits file_feedback.
block observe_filesystem(env_id, api_url, token):
  session: file_observer
    prompt: """Fetch the initial file tree:
GET {api_url}/environments/{env_id}/files/tree?depth=3
Authorization: Bearer {token}

Log the baseline directory structure."""
    permissions:
      network: ["{api_url}/*"]

  let snapshot_count = 0

  # NOTE(review): the loop condition references the websocket observer's
  # completion, a signal produced by a sibling parallel task — confirm the
  # runtime can observe it; max: 30 bounds the loop regardless.
  loop until **websocket observer signals completion** (max: 30):
    let snapshot_count = snapshot_count + 1

    resume: file_observer
      prompt: """Snapshot #{snapshot_count}: Fetch and compare file tree.
Log what's NEW, MODIFIED, and any interesting state files to read."""
      permissions:
        network: ["{api_url}/*"]

  output file_feedback = resume: file_observer
    prompt: """Final filesystem assessment:
1. Total snapshots taken
2. Files created during execution
3. State file clarity
4. Top 3 recommendations"""

# ============================================================================
# Blocks: Investigation
# ============================================================================

# Fans the persistent researcher out over three parallel angles (code,
# logs, execution context), then resumes it to merge findings into a
# single structured diagnosis.
block investigate_error(error_summary, ws_results, file_results, exec_info):
  # Parallel investigation from multiple angles
  parallel:
    code_analysis = session: researcher
      prompt: """Investigate the CODE PATH for this error:

ERROR: {error_summary}

Search the codebase for:
1. The execution logic that produced this error
2. Error handling paths
3. Recent changes to related code (git log)

Focus on understanding HOW this error was produced."""
      permissions:
        filesystem: ["read"]

    log_analysis = session: researcher
      prompt: """Investigate the LOGS for this error:

ERROR: {error_summary}

WebSocket observations:
{ws_results}

File explorer observations:
{file_results}

Look for:
1. Error messages and stack traces
2. Timing of events
3. Any warnings before the error"""
      context: { ws_results, file_results }

    context_analysis = session: researcher
      prompt: """Investigate the EXECUTION CONTEXT:

ERROR: {error_summary}

Execution info:
{exec_info}

Check:
1. Environment state
2. Database records for this execution
3. Any configuration issues"""
      context: exec_info
      permissions:
        database: ["read"]

  # Synthesize findings from all angles
  output diagnosis = resume: researcher
    prompt: """Synthesize your parallel investigations into a unified diagnosis:

Code analysis: {code_analysis}
Log analysis: {log_analysis}
Context analysis: {context_analysis}

Provide:
- Root cause (specific and actionable)
- Evidence chain
- Confidence level
- Affected components"""
    context: { code_analysis, log_analysis, context_analysis }

# Hands the diagnosis to the verifier agent; emits its structured
# verification (diagnosis_sound, critique, follow_up_questions, ...).
block verify_diagnosis(diagnosis, original_error, ws_results):
  # NOTE(review): original_error is interpolated into the prompt but not
  # listed in context: — confirm interpolation alone is sufficient here.
  output verification = session: diagnosis_verifier
    prompt: """Verify this diagnosis:

DIAGNOSIS:
{diagnosis}

ORIGINAL ERROR:
{original_error}

OBSERVATIONS:
{ws_results}

Is this diagnosis sound? If not, what's missing?"""
    context: { diagnosis, ws_results }

# ============================================================================
# Blocks: Remediation
# ============================================================================

# Engineer implements the fix, then a bounded review loop (max 3) resumes
# the engineer on blocking issues; emits the fix plus its final review.
block quick_fix_cycle(diagnosis, triage):
  # Implement the fix
  let fix = session: engineer
    prompt: """Implement a fix for:

DIAGNOSIS: {diagnosis}
APPROACH: {triage.recommended_approach}

Make the smallest change that fixes the issue.
Commit with: fix(scope): description"""
    permissions:
      filesystem: ["read", "write"]

  # Review loop
  loop until **review approved** (max: 3):
    let review = session: reviewer
      prompt: """Review this fix:

DIAGNOSIS: {diagnosis}
IMPLEMENTATION: {fix}

Does it address the root cause? Any regressions?"""
      context: { diagnosis, fix }

    if **review has blocking issues**:
      let fix = resume: engineer
        prompt: """Address review feedback:

{review.issues}

Update your fix accordingly."""
        context: review
        permissions:
          filesystem: ["read", "write"]

  output fix_result = { fix, review }

# Deploys (with exponential-backoff retry), smoke tests, and emits a
# result whose success flag reflects the smoke test.
# NOTE(review): the fix_result parameter is accepted but never referenced
# in this block's prompts — confirm this is intentional.
block deploy_and_verify(fix_result):
  # Deploy with retry
  let deploy = session "Deploy fix"
    prompt: """Deploy following docs/DEPLOYMENT.md.
Verify deployment succeeded."""
    retry: 3
    backoff: exponential
    permissions:
      network: ["*"]

  # Smoke test
  let smoke = session: smoke_tester
    prompt: """Post-deployment verification per docs/MONITORING.md:
1. Health endpoints
2. Verify bug is fixed
3. Check for new errors"""

  output deploy_result = { deploy, smoke, success: **smoke test passed** }

# Plan -> human approval -> parallel phase execution -> review -> deploy.
# Returns early (success: false) if the plan or review is rejected.
block bigger_change_flow(diagnosis, triage):
  # Build the plan
  let plan = session: build_planner
    prompt: """Create a build plan for:

DIAGNOSIS: {diagnosis}
TRIAGE: {triage}

Follow docs/PLANNING_BEST_PRACTICES.md."""
    context:
      file: "docs/PLANNING_BEST_PRACTICES.md"

  # User approval of plan
  input plan_approval: **
Build plan created:
{plan}

Approve and execute?
**

  if plan_approval != "approve":
    output change_result = { success: false, reason: plan_approval, plan }
    return

  # Execute phases (parallel where possible)
  let phase_results = plan.phases
    | pmap:
        session: engineer
          prompt: """Execute phase:
{item.name}
{item.tasks}

Complete tasks, run verification, commit."""
          permissions:
            filesystem: ["read", "write"]

  # Final review
  let review = session: reviewer
    prompt: """Review complete implementation:

PLAN: {plan}
RESULTS: {phase_results}

All phases complete? Root cause addressed?"""
    context: { plan, phase_results }

  if **review not approved**:
    output change_result = { success: false, reason: "Review failed", review }
    return

  # Deploy
  let deploy_result = do deploy_and_verify({ fix: phase_results, review })

  output change_result = {
    success: deploy_result.success,
    plan,
    phases: phase_results,
    review,
    deploy: deploy_result
  }

# ============================================================================
# Main Workflow
# ============================================================================

# Phase 1: Setup
let exec = session "Execute POST /run"
  prompt: """POST to {api_url}/run with the test program.
Return executionId, environmentId, wsUrl."""
  permissions:
    network: ["{api_url}/*"]

session "Log test configuration"
  prompt: """Log: timestamp, API URL, execution/environment IDs, program snippet."""
  context: exec

# Phase 2: Parallel Observation
parallel:
  ws_results = do observe_websocket(exec.wsUrl, auth_token, test_program)
  file_results = do observe_filesystem(exec.environmentId, api_url, auth_token)

# Phase 3: Synthesis
let synthesis = session: synthesizer
  prompt: """Synthesize observations into UX assessment.

WebSocket: {ws_results}
File Explorer: {file_results}

Include error classification at the end."""
  context: { ws_results, file_results, exec }

# Phase 4: Error Remediation (if needed)
if **blocking error detected in synthesis**:

  # User checkpoint: investigate?
  input investigate_decision: **
Blocking error detected:
{synthesis.error_summary}

Investigate and attempt remediation?
**

  if investigate_decision == "skip":
    output final_result = { test_results: synthesis, remediation: "skipped" }

  elif investigate_decision == "investigate only":
    let diagnosis = do investigate_error(synthesis.error_summary, ws_results, file_results, exec)
    output final_result = { test_results: synthesis, diagnosis, remediation: "investigation only" }

  else:
    # Full remediation flow
    let diagnosis = do investigate_error(synthesis.error_summary, ws_results, file_results, exec)

    # Verification loop
    loop until **diagnosis verified** (max: 3):
      let verification = do verify_diagnosis(diagnosis, synthesis.error_summary, ws_results)

      if verification.diagnosis_sound:
        break
      else:
        let diagnosis = resume: researcher
          prompt: """Diagnosis needs refinement:

{verification.critique}

Investigate: {verification.follow_up_questions}"""

    # User checkpoint: confirm diagnosis before action
    input diagnosis_confirmation: **
Diagnosis verified:
{diagnosis}

Proceed to triage and remediation?
**

    if diagnosis_confirmation != "proceed":
      output final_result = { test_results: synthesis, diagnosis, remediation: diagnosis_confirmation }

    else:
      # Triage
      let triage = session: triage_expert
        prompt: """Triage this bug: {diagnosis}"""
        context: diagnosis

      # Route based on triage
      choice **triage decision**:
        option "Quick fix":
          let fix_result = do quick_fix_cycle(diagnosis, triage)

          # User checkpoint before deploy
          input deploy_decision: **
Fix implemented and reviewed:
{fix_result}

Deploy to production?
**

          if deploy_decision == "deploy":
            let deploy_result = do deploy_and_verify(fix_result)

            if not deploy_result.success:
              # Recursive: re-run test to verify or catch new issues
              input retry_decision: **
Deployment or smoke test failed.
Re-run the full test to diagnose new issues?
**

              if retry_decision == "yes":
                # Note: This would re-invoke the program - true self-healing
                session "Log: Triggering re-test after failed deployment"

            output final_result = { test_results: synthesis, diagnosis, triage, fix: fix_result, deploy: deploy_result }
          else:
            output final_result = { test_results: synthesis, diagnosis, triage, fix: fix_result, deploy: "skipped" }

        option "Bigger change":
          # CEO checkpoint is built into bigger_change_flow
          let change_result = do bigger_change_flow(diagnosis, triage)
          output final_result = { test_results: synthesis, diagnosis, triage, change: change_result }

else:
  # No blocking error
  output final_result = { test_results: synthesis, remediation: "none needed" }
|
||||
Reference in New Issue
Block a user