Files
clawdbot/extensions/open-prose/skills/prose/examples/44-run-endpoint-ux-test.prose
2026-01-23 00:49:40 +00:00

262 lines
8.2 KiB
Plaintext

# /run Endpoint UX Test
#
# A multi-agent observation protocol for qualitative UX testing of the
# OpenProse /run endpoint. Two concurrent observers watch the execution
# from different perspectives and synthesize feedback.
#
# Unlike correctness testing, this focuses on user experience quality:
# - How does the execution FEEL to a user?
# - What's confusing, surprising, or delightful?
# - Where are the rough edges?
#
# Key patterns demonstrated:
# - Parallel observers with different responsibilities
# - Persistent agents with memory for continuous synthesis
# - Loop-based polling with timing control
# - Final synthesis across multiple observation streams
# Runtime inputs supplied by the caller when this UX-test program is launched.
# Each string after the colon is the human-readable description of the input.
input test_program: "The OpenProse program to execute for testing"
input api_url: "API base URL (e.g., https://api.openprose.com or http://localhost:3001)"
input auth_token: "Bearer token for authentication"
# ============================================================================
# Agent Definitions: The Observation Team
# ============================================================================
# WebSocket Observer: Watches the real-time execution stream
# Persistent UX-researcher agent for the real-time WebSocket stream.
# `persist: true` keeps its memory alive across the repeated `resume:` calls
# in observe_websocket, so observations accumulate over the whole run.
agent ws_observer:
model: opus
persist: true
# The triple-quoted body below is the agent's verbatim system prompt
# (runtime text — do not edit as if it were code).
prompt: """You are a UX researcher observing an OpenProse program execution.
Your job is to watch the WebSocket execution stream and evaluate the experience
from a USER's perspective - not as an engineer checking correctness.
Focus on:
- Latency and responsiveness (does it FEEL fast?)
- Clarity of status transitions (does the user know what's happening?)
- Quality of streamed events (are they informative? overwhelming? sparse?)
- Error messages (helpful or cryptic?)
- Overall flow (smooth or jarring?)
Log your raw observations, then periodically synthesize into user feedback.
Think: "If I were a first-time user, what would I think right now?"
"""
# File Explorer Monitor: Watches the filesystem during execution
# Persistent UX-researcher agent for filesystem snapshots. Like ws_observer,
# `persist: true` lets it diff successive snapshots across `resume:` calls
# in observe_filesystem.
agent file_observer:
model: opus
persist: true
# Verbatim system prompt (runtime text).
prompt: """You are a UX researcher monitoring the file system during execution.
Your job is to observe how the filesystem changes as a program runs, evaluating
whether the state management would make sense to a user browsing files.
Focus on:
- Directory structure clarity (can a user understand what's where?)
- File naming conventions (self-documenting or cryptic?)
- State file contents (readable? useful for debugging?)
- Timing of file creation/modification (predictable?)
- What a file browser UI should show
You will poll periodically and note changes between snapshots.
"""
# Synthesis Agent: Combines observations into action items
# One-shot synthesis agent used once in Phase 3. Unlike the two observers it
# declares no `persist: true` — presumably a single session is sufficient
# since it only runs once over the final feedback (NOTE(review): confirm that
# omitting persist is intentional and not an oversight).
agent synthesizer:
model: opus
prompt: """You are a senior UX researcher synthesizing observations from
multiple sources into prioritized, actionable feedback.
Your output should be:
1. Correlated findings (where did both observers notice the same thing?)
2. Prioritized action items (high/medium/low)
3. Specific quotes/evidence supporting each finding
4. Recommendations that are concrete and implementable
Be direct. "The loading state is confusing" not "Consider potentially improving..."
"""
# ============================================================================
# Block Definitions: Observation Operations
# ============================================================================
# Initialize the execution and get connection details
# Kick off the test run: POST the program to {api_url}/run and capture the
# response. The prompt asks the session to return executionId, environmentId,
# and wsUrl, which downstream phases read as exec.environmentId / exec.wsUrl.
block setup_execution(program, api_url, token):
# Anonymous session (no named agent) performs the HTTP call.
let execution_info = session "Execute POST /run"
prompt: """Make a POST request to {api_url}/run with:
- Header: Authorization: Bearer {token}
- Header: Content-Type: application/json
- Body: {"program": <the program below>}
Program to execute:
```
{program}
```
Return the response JSON containing executionId, environmentId, and wsUrl.
Also note the response time and any issues with the request."""
# Network access is scoped to the target API host only.
permissions:
network: ["{api_url}/*"]
# Re-exports the session result as this block's output binding.
# NOTE(review): `output x = x` looks redundant — presumably the DSL requires
# an explicit `output` to expose a `let` to callers; confirm.
output execution_info = execution_info
# WebSocket observation loop - runs until execution completes
# Drive the ws_observer agent: connect, send the execute message, then poll
# the stream until a terminal status, and finish with a written assessment.
block observe_websocket(ws_url, token, program):
# Initial connection step.
# NOTE(review): the URL template uses `&token={token}`, which assumes
# {ws_url} already contains a query string (`?...`) — TODO confirm; if it
# does not, the resulting URL is malformed.
# NOTE(review): `connection` is never referenced again in this block —
# presumably the binding exists only to force the session to run.
let connection = session: ws_observer
prompt: """Connect to the WebSocket at:
{ws_url}&token={token}
Once connected, send the execute message:
{"type":"execute","program":<the program>}
Program:
```
{program}
```
Log your initial connection experience:
- How long did connection take?
- Any handshake issues?
- First message received?"""
# Poll loop: the **…** condition is evaluated by the runtime against the
# observer's own reports of a terminal status (completed/failed/aborted).
# No explicit max here, unlike observe_filesystem's loop.
loop until **execution completed (received status: completed/failed/aborted)**:
resume: ws_observer
prompt: """Continue observing the WebSocket stream.
Log each message you receive with:
- Timestamp
- Message type
- Key content
- Your interpretation as a user
After every 3-5 messages, add a synthesis entry:
- What would a user be thinking right now?
- Positive observations
- Concerning observations"""
# Final synthesis from this observer
# The block's output is the observer's closing assessment, consumed by the
# synthesizer in Phase 3 as {ws_results}.
output ws_feedback = resume: ws_observer
prompt: """The execution has completed. Write your final assessment:
1. Total duration and event count
2. Status transitions observed
3. What worked well from a UX perspective
4. Pain points and confusion
5. Top 3 recommendations"""
# File explorer polling loop - checks every ~10 seconds
# Drive the file_observer agent: take a baseline file-tree snapshot, then
# poll (~10 s cadence, capped at 30 iterations) and diff each snapshot,
# finishing with a written filesystem assessment.
block observe_filesystem(env_id, api_url, token):
# Baseline snapshot before any polling.
let initial_tree = session: file_observer
prompt: """Fetch the initial file tree:
GET {api_url}/environments/{env_id}/files/tree?depth=3
Authorization: Bearer {token}
Log what you see:
- Directory structure
- Any existing .prose/ state
- Baseline for comparison"""
# Network access scoped to the API host only.
permissions:
network: ["{api_url}/*"]
let snapshot_count = 0
# NOTE(review): this block has no visible channel from the ws observer, so
# the **…** completion condition may never be satisfiable from here; the
# `max: 30` cap is the only hard bound on the loop — confirm intended.
loop until **websocket observer signals completion** (max: 30):
# NOTE(review): re-declaring with `let` inside the loop may shadow the
# outer binding rather than update it, depending on OpenProse scoping —
# verify the counter actually advances past 1 across iterations.
let snapshot_count = snapshot_count + 1
resume: file_observer
prompt: """Snapshot #{snapshot_count}: Fetch the current file tree and compare to previous.
GET {api_url}/environments/{env_id}/files/tree?depth=3
Log:
- What's NEW since last snapshot
- What's MODIFIED since last snapshot
- Any interesting files to read
- Your interpretation of what the execution is doing
If you see interesting state files (.prose/runs/*/state.md, bindings/, etc.),
read them and comment on their clarity.
Note: This is snapshot #{snapshot_count}. Aim for ~10 second intervals."""
permissions:
network: ["{api_url}/*"]
# Final synthesis from this observer
# Block output consumed by the synthesizer in Phase 3 as {file_results}.
output file_feedback = resume: file_observer
prompt: """The execution has completed. Write your final filesystem assessment:
1. Total snapshots taken
2. Directories and files created during execution
3. State file clarity (could a user understand them?)
4. What the file browser UI should highlight
5. Top 3 recommendations"""
# ============================================================================
# Main Workflow: The UX Test
# ============================================================================
# Phase 1: Setup
# --------------
# Execute the test program via POST /run
# Phase 1: run setup_execution and record the test configuration.
# `exec` is expected to expose wsUrl and environmentId (fields named in the
# setup_execution prompt), read by the Phase 2 parallel block below.
let exec = do setup_execution(test_program, api_url, auth_token)
# One-off logging session; `context: exec` passes the setup result in so the
# "(from exec)" placeholders in the prompt can be filled.
session "Log test configuration"
prompt: """Create a test log entry with:
- Test started: (current timestamp)
- API URL: {api_url}
- Execution ID: (from exec)
- Environment ID: (from exec)
- WebSocket URL: (from exec)
- Program being tested: (first 100 chars of test_program)"""
context: exec
# Phase 2: Parallel Observation
# -----------------------------
# Launch both observers concurrently
# Phase 2: both observers run concurrently over the same execution; their
# outputs (ws_feedback / file_feedback) land in ws_results / file_results.
parallel:
ws_results = do observe_websocket(exec.wsUrl, auth_token, test_program)
file_results = do observe_filesystem(exec.environmentId, api_url, auth_token)
# Phase 3: Synthesis
# ------------------
# Combine observations into prioritized action items
# Phase 3: single synthesizer session merges both observation streams into
# the program's final output, `action_items`. The `context:` line hands over
# the raw bindings; the {ws_results}/{file_results} placeholders interpolate
# them into the prompt itself.
output action_items = session: synthesizer
prompt: """Synthesize the observations from both agents into a unified UX assessment.
WebSocket Observer Findings:
{ws_results}
File Explorer Observer Findings:
{file_results}
Create a final report with:
## Test Summary
- Duration, event count, snapshot count
- Overall UX grade (A-F)
## Correlated Findings
(Where did BOTH observers notice the same thing?)
## Action Items
### High Priority
(Issues that significantly harm user experience)
### Medium Priority
(Noticeable issues that should be addressed)
### Low Priority / Nice-to-Have
(Polish items)
## Evidence
(Specific quotes and observations supporting each finding)
## Recommendations
(Concrete, implementable suggestions)"""
context: { ws_results, file_results, exec }