# /run Endpoint UX Test
#
# A multi-agent observation protocol for qualitative UX testing of the
# OpenProse /run endpoint. Two concurrent observers watch the execution
# from different perspectives and synthesize feedback.
#
# Unlike correctness testing, this focuses on user experience quality:
# - How does the execution FEEL to a user?
# - What's confusing, surprising, or delightful?
# - Where are the rough edges?
#
# Key patterns demonstrated:
# - Parallel observers with different responsibilities
# - Persistent agents with memory for continuous synthesis
# - Loop-based polling with timing control
# - Final synthesis across multiple observation streams

input test_program: "The OpenProse program to execute for testing"
input api_url: "API base URL (e.g., https://api.openprose.com or http://localhost:3001)"
input auth_token: "Bearer token for authentication"

# ============================================================================
# Agent Definitions: The Observation Team
# ============================================================================

# WebSocket Observer: Watches the real-time execution stream
agent ws_observer:
  model: opus
  persist: true
  prompt: """You are a UX researcher observing an OpenProse program execution.

  Your job is to watch the WebSocket execution stream and evaluate the experience
  from a USER's perspective - not as an engineer checking correctness.

  Focus on:
  - Latency and responsiveness (does it FEEL fast?)
  - Clarity of status transitions (does the user know what's happening?)
  - Quality of streamed events (are they informative? overwhelming? sparse?)
  - Error messages (helpful or cryptic?)
  - Overall flow (smooth or jarring?)

  Log your raw observations, then periodically synthesize them into user feedback.
  Think: "If I were a first-time user, what would I think right now?"
  """

# File Explorer Monitor: Watches the filesystem during execution
agent file_observer:
  model: opus
  persist: true
  prompt: """You are a UX researcher monitoring the file system during execution.

  Your job is to observe how the filesystem changes as a program runs, evaluating
  whether the state management would make sense to a user browsing files.

  Focus on:
  - Directory structure clarity (can a user understand what's where?)
  - File naming conventions (self-documenting or cryptic?)
  - State file contents (readable? useful for debugging?)
  - Timing of file creation/modification (predictable?)
  - What a file browser UI should show

  You will poll periodically and note changes between snapshots.
  """

# Synthesis Agent: Combines observations into action items
agent synthesizer:
  model: opus
  prompt: """You are a senior UX researcher synthesizing observations from
  multiple sources into prioritized, actionable feedback.

  Your output should be:
  1. Correlated findings (where did both observers notice the same thing?)
  2. Prioritized action items (high/medium/low)
  3. Specific quotes/evidence supporting each finding
  4. Recommendations that are concrete and implementable

  Be direct. "The loading state is confusing" not "Consider potentially improving..."
  """

# ============================================================================
# Block Definitions: Observation Operations
# ============================================================================

# Initialize the execution and get connection details
block setup_execution(program, api_url, token):
  let execution_info = session "Execute POST /run"
    prompt: """Make a POST request to {api_url}/run with:
    - Header: Authorization: Bearer {token}
    - Header: Content-Type: application/json
    - Body: {"program": <the program below>}

    Program to execute:
    ```
    {program}
    ```

    Return the response JSON containing executionId, environmentId, and wsUrl.
    Also note the response time and any issues with the request."""
    permissions:
      network: ["{api_url}/*"]

  output execution_info = execution_info
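
# For orientation: the /run response captured above is expected to carry the
# three fields named in the prompt. A rough sketch of its shape - the field
# names come from this file, but the example values and exact nesting are
# assumptions, not documented API behavior:
#
#   {
#     "executionId": "exec_abc123",
#     "environmentId": "env_xyz789",
#     "wsUrl": "wss://api.openprose.com/ws?executionId=exec_abc123"
#   }
#
# (All three values are hypothetical examples.)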

# WebSocket observation loop - runs until execution completes
block observe_websocket(ws_url, token, program):
  let connection = session: ws_observer
    prompt: """Connect to the WebSocket at:
    {ws_url}&token={token}

    Once connected, send the execute message:
    {"type":"execute","program":<the program>}

    Program:
    ```
    {program}
    ```

    Log your initial connection experience:
    - How long did connection take?
    - Any handshake issues?
    - First message received?"""

  loop until **execution completed (received status: completed/failed/aborted)**:
    resume: ws_observer
      prompt: """Continue observing the WebSocket stream.

      Log each message you receive with:
      - Timestamp
      - Message type
      - Key content
      - Your interpretation as a user

      After every 3-5 messages, add a synthesis entry:
      - What would a user be thinking right now?
      - Positive observations
      - Concerning observations"""

  # Final synthesis from this observer
  output ws_feedback = resume: ws_observer
    prompt: """The execution has completed. Write your final assessment:

    1. Total duration and event count
    2. Status transitions observed
    3. What worked well from a UX perspective
    4. Pain points and confusion
    5. Top 3 recommendations"""
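
# For concreteness, one log entry of the kind the loop above requests might
# read as follows. Only the completed/failed/aborted statuses are named
# elsewhere in this file; the exact message shape shown is an assumption:
#
#   12:04:31  type: status  content: {"status":"completed"}
#   user read: "It finished - but I never saw progress along the way."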

# File explorer polling loop - checks every ~10 seconds
block observe_filesystem(env_id, api_url, token):
  let initial_tree = session: file_observer
    prompt: """Fetch the initial file tree:
    GET {api_url}/environments/{env_id}/files/tree?depth=3
    Authorization: Bearer {token}

    Log what you see:
    - Directory structure
    - Any existing .prose/ state
    - Baseline for comparison"""
    permissions:
      network: ["{api_url}/*"]

  let snapshot_count = 0
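
  # Cap the polling at 30 snapshots: at the ~10-second cadence requested in the
  # prompt below, that bounds this observer to roughly 5 minutes even if the
  # completion signal never arrives.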
  loop until **websocket observer signals completion** (max: 30):
    let snapshot_count = snapshot_count + 1

    resume: file_observer
      prompt: """Snapshot #{snapshot_count}: Fetch the current file tree and compare it to the previous snapshot.

      GET {api_url}/environments/{env_id}/files/tree?depth=3

      Log:
      - What's NEW since the last snapshot
      - What's MODIFIED since the last snapshot
      - Any interesting files to read
      - Your interpretation of what the execution is doing

      If you see interesting state files (.prose/runs/*/state.md, bindings/, etc.),
      read them and comment on their clarity.

      Note: This is snapshot #{snapshot_count}. Aim for ~10 second intervals."""
      permissions:
        network: ["{api_url}/*"]

  # Final synthesis from this observer
  output file_feedback = resume: file_observer
    prompt: """The execution has completed. Write your final filesystem assessment:

    1. Total snapshots taken
    2. Directories and files created during execution
    3. State file clarity (could a user understand them?)
    4. What the file browser UI should highlight
    5. Top 3 recommendations"""
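
# The state layout the snapshots are meant to surface, inferred from the paths
# named in the prompt above (.prose/runs/*/state.md and bindings/). The nesting
# shown is an assumption, not a documented layout:
#
#   .prose/
#     runs/
#       <run-id>/
#         state.md
#         bindings/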

# ============================================================================
# Main Workflow: The UX Test
# ============================================================================

# Phase 1: Setup
# --------------
# Execute the test program via POST /run

let exec = do setup_execution(test_program, api_url, auth_token)

session "Log test configuration"
  prompt: """Create a test log entry with:
  - Test started: (current timestamp)
  - API URL: {api_url}
  - Execution ID: (from exec)
  - Environment ID: (from exec)
  - WebSocket URL: (from exec)
  - Program being tested: (first 100 chars of test_program)"""
  context: exec

# Phase 2: Parallel Observation
# -----------------------------
# Launch both observers concurrently

parallel:
  ws_results = do observe_websocket(exec.wsUrl, auth_token, test_program)
  file_results = do observe_filesystem(exec.environmentId, api_url, auth_token)
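
# Note: ws_results receives the ws_feedback output of observe_websocket and
# file_results receives the file_feedback output of observe_filesystem; the
# synthesis step below starts only after both branches have returned.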

# Phase 3: Synthesis
# ------------------
# Combine observations into prioritized action items

output action_items = session: synthesizer
  prompt: """Synthesize the observations from both agents into a unified UX assessment.

  WebSocket Observer Findings:
  {ws_results}

  File Explorer Observer Findings:
  {file_results}

  Create a final report with:

  ## Test Summary
  - Duration, event count, snapshot count
  - Overall UX grade (A-F)

  ## Correlated Findings
  (Where did BOTH observers notice the same thing?)

  ## Action Items

  ### High Priority
  (Issues that significantly harm user experience)

  ### Medium Priority
  (Noticeable issues that should be addressed)

  ### Low Priority / Nice-to-Have
  (Polish items)

  ## Evidence
  (Specific quotes and observations supporting each finding)

  ## Recommendations
  (Concrete, implementable suggestions)"""
  context: { ws_results, file_results, exec }