"""iOS PhoneAgent class for orchestrating iOS phone automation."""
import json
import traceback
from dataclasses import dataclass
from typing import Any, Callable
from phone_agent.actions.handler import do, finish, parse_action
from phone_agent.actions.handler_ios import IOSActionHandler
from phone_agent.config import get_messages, get_system_prompt
from phone_agent.model import ModelClient, ModelConfig
from phone_agent.model.client import MessageBuilder
from phone_agent.xctest import XCTestConnection, get_current_app, get_screenshot
@dataclass
class IOSAgentConfig:
"""Configuration for the iOS PhoneAgent."""
max_steps: int = 100
wda_url: str = "http://localhost:8100"
session_id: str | None = None
device_id: str | None = None # iOS device UDID
lang: str = "cn"
system_prompt: str | None = None
verbose: bool = True
step_callback: Callable[[Any], str | None] | None = None
"""Callback after each step. Return 'stop' to interrupt, or a new task string to switch."""
before_action_callback: Callable[[dict[str, Any]], dict[str, Any] | None] | None = None
"""Callback before executing action. Return modified action dict, or None to proceed as-is."""
def __post_init__(self):
if self.system_prompt is None:
self.system_prompt = get_system_prompt(self.lang)
@dataclass
class StepResult:
"""Result of a single agent step."""
success: bool
finished: bool
action: dict[str, Any] | None
thinking: str
message: str | None = None
step_count: int = 0
class IOSPhoneAgent:
    """
    AI-powered agent for automating iOS phone interactions.

    The agent uses a vision-language model to understand screen content
    and decide on actions to complete user tasks via WebDriverAgent.

    Args:
        model_config: Configuration for the AI model.
        agent_config: Configuration for the iOS agent behavior.
        confirmation_callback: Optional callback for sensitive action confirmation.
        takeover_callback: Optional callback for takeover requests.

    Callbacks in agent_config:
        step_callback: Called with the StepResult after each step that did
            not finish the task.
            - Return 'stop' to interrupt the task
            - Return a new task string to switch tasks
            - Return None to continue normally
        before_action_callback: Called before executing an action with the action dict.
            - Return modified action dict to override
            - Return None to execute the original action

    Example:
        >>> from phone_agent.agent_ios import IOSPhoneAgent, IOSAgentConfig
        >>> from phone_agent.model import ModelConfig
        >>>
        >>> # With callback
        >>> def on_step(result):
        ...     if result.step_count > 10:
        ...         return "stop"  # Interrupt after 10 steps
        ...     return None
        >>>
        >>> model_config = ModelConfig(base_url="http://localhost:8000/v1")
        >>> agent_config = IOSAgentConfig(wda_url="http://localhost:8100", step_callback=on_step)
        >>> agent = IOSPhoneAgent(model_config, agent_config)
        >>> agent.run("Open Safari and search for Apple")
    """

    def __init__(
        self,
        model_config: ModelConfig | None = None,
        agent_config: IOSAgentConfig | None = None,
        confirmation_callback: Callable[[str], bool] | None = None,
        takeover_callback: Callable[[str], None] | None = None,
    ):
        """Set up the model client, the WDA connection and the action handler.

        When ``agent_config.session_id`` is None a WDA session is started and,
        if an explicit ID comes back, it is stored on ``agent_config``
        (the passed-in config object is mutated).
        """
        self.model_config = model_config or ModelConfig()
        self.agent_config = agent_config or IOSAgentConfig()
        self.model_client = ModelClient(self.model_config)

        # Initialize WDA connection and create session if needed
        self.wda_connection = XCTestConnection(wda_url=self.agent_config.wda_url)

        # Auto-create session if not provided
        if self.agent_config.session_id is None:
            success, session_id = self.wda_connection.start_wda_session()
            # "session_started" is treated as "no explicit session ID returned".
            if success and session_id != "session_started":
                self.agent_config.session_id = session_id
                if self.agent_config.verbose:
                    print(f"✅ Created WDA session: {session_id}")
            elif self.agent_config.verbose:
                print("⚠️ Using default WDA session (no explicit session ID)")

        self.action_handler = IOSActionHandler(
            wda_url=self.agent_config.wda_url,
            session_id=self.agent_config.session_id,
            confirmation_callback=confirmation_callback,
            takeover_callback=takeover_callback,
        )

        self._context: list[dict[str, Any]] = []
        self._step_count = 0

    def run(self, task: str) -> str:
        """
        Run the agent to complete a task.

        The first step is always executed; further steps run until a step
        reports ``finished`` or the step counter reaches ``max_steps``.

        Args:
            task: Natural language description of the task.

        Returns:
            Final message from the agent, "Task completed" when the final
            step carries no message, or "Max steps reached".
        """
        self.reset()

        # First step with user prompt
        result = self._execute_step(task, is_first=True)

        # Continue until finished or max steps reached
        while not result.finished and self._step_count < self.agent_config.max_steps:
            result = self._execute_step(is_first=False)

        if result.finished:
            return result.message or "Task completed"
        return "Max steps reached"

    def step(self, task: str | None = None) -> StepResult:
        """
        Execute a single step of the agent.

        Useful for manual control or debugging.

        Args:
            task: Task description (only needed for first step).

        Returns:
            StepResult with step details.

        Raises:
            ValueError: If the context is empty and no task is given.
        """
        # An empty context means no step has been recorded yet.
        is_first = not self._context
        if is_first and not task:
            raise ValueError("Task is required for the first step")
        return self._execute_step(task, is_first)

    def reset(self) -> None:
        """Reset the agent state for a new task."""
        self._context = []
        self._step_count = 0

    def _execute_step(
        self, user_prompt: str | None = None, is_first: bool = False
    ) -> StepResult:
        """Execute a single step of the agent loop.

        Captures the screen, asks the model for the next action, runs the
        action through the action handler and records the exchange in the
        conversation context.

        Args:
            user_prompt: Task text; used only when ``is_first`` is True.
            is_first: Whether this step opens a new conversation (adds the
                system message and the task text).

        Returns:
            StepResult for this step. If ``step_callback`` requests a task
            switch, the result of the new task's first step is returned.
        """
        config = self.agent_config
        self._step_count += 1

        # Capture current screen state
        screenshot = get_screenshot(
            wda_url=config.wda_url,
            session_id=config.session_id,
            device_id=config.device_id,
        )
        current_app = get_current_app(
            wda_url=config.wda_url, session_id=config.session_id
        )

        # Build messages: the first turn carries the system prompt and the task,
        # later turns carry only the refreshed screen info.
        if is_first:
            self._context.append(
                MessageBuilder.create_system_message(config.system_prompt)
            )
        screen_info = MessageBuilder.build_screen_info(current_app)
        if is_first:
            text_content = f"{user_prompt}\n\n{screen_info}"
        else:
            text_content = f"** Screen Info **\n\n{screen_info}"
        self._context.append(
            MessageBuilder.create_user_message(
                text=text_content, image_base64=screenshot.base64_data
            )
        )

        # Get model response; a model failure ends the task with an error result.
        try:
            response = self.model_client.request(self._context)
        except Exception as e:
            if config.verbose:
                traceback.print_exc()
            return StepResult(
                success=False,
                finished=True,
                action=None,
                thinking="",
                message=f"Model error: {e}",
                step_count=self._step_count,
            )

        # Parse action from response; unparseable output becomes a finish
        # action carrying the raw text.
        try:
            action = parse_action(response.action)
        except ValueError:
            if config.verbose:
                traceback.print_exc()
            action = finish(message=response.action)

        if config.verbose:
            # Print thinking process
            msgs = get_messages(config.lang)
            print("\n" + "=" * 50)
            print(f"💭 {msgs['thinking']}:")
            print("-" * 50)
            print(response.thinking)
            print("-" * 50)
            print(f"🎯 {msgs['action']}:")
            print(json.dumps(action, ensure_ascii=False, indent=2))
            print("=" * 50 + "\n")

        # Remove image from context to save space
        self._context[-1] = MessageBuilder.remove_images_from_message(self._context[-1])

        # Before action callback - allow modifying or intercepting action.
        # Callback errors are reported (when verbose) and the original action is kept.
        if config.before_action_callback is not None:
            try:
                modified_action = config.before_action_callback(action)
            except Exception as e:
                if config.verbose:
                    print(f"Warning: before_action_callback error: {e}")
            else:
                if modified_action is not None:
                    action = modified_action

        # Execute action; on failure, run a finish action carrying the error text.
        try:
            result = self.action_handler.execute(
                action, screenshot.width, screenshot.height
            )
        except Exception as e:
            if config.verbose:
                traceback.print_exc()
            result = self.action_handler.execute(
                finish(message=str(e)), screenshot.width, screenshot.height
            )

        # Add assistant response to context
        self._context.append(
            MessageBuilder.create_assistant_message(
                f"{response.thinking}{response.action}"
            )
        )

        # Check if finished
        finished = action.get("_metadata") == "finish" or result.should_finish

        if finished and config.verbose:
            msgs = get_messages(config.lang)
            final_message = result.message or action.get("message", msgs["done"])
            print("\n" + "🎉 " + "=" * 48)
            print(f"✅ {msgs['task_completed']}: {final_message}")
            print("=" * 50 + "\n")

        # Build step result
        step_result = StepResult(
            success=result.success,
            finished=finished,
            action=action,
            thinking=response.thinking,
            message=result.message or action.get("message"),
            step_count=self._step_count,
        )

        # Step callback - allow interrupting or switching tasks.
        # It is not consulted once the task has already finished.
        if config.step_callback is None or finished:
            return step_result

        # Only the callback invocation is guarded, so errors raised while
        # executing a switched-to task are not mislabelled as callback errors.
        try:
            callback_result = config.step_callback(step_result)
        except Exception as e:
            if config.verbose:
                print(f"Warning: step_callback error: {e}")
            return step_result

        if callback_result == "stop":
            # Interrupt the task
            if config.verbose:
                print("\n⏹ Task interrupted by callback\n")
            step_result.finished = True
        elif isinstance(callback_result, str):
            # Switch to new task: start over with a fresh context and step count.
            if config.verbose:
                print(f"\n🔄 Switching to new task: {callback_result}\n")
            self.reset()
            return self._execute_step(callback_result, is_first=True)

        return step_result

    @property
    def context(self) -> list[dict[str, Any]]:
        """Get a shallow copy of the current conversation context."""
        return self._context.copy()

    @property
    def step_count(self) -> int:
        """Get the current step count."""
        return self._step_count