feat: Added iOS support
This commit is contained in:
280
phone_agent/actions/handler_ios.py
Normal file
280
phone_agent/actions/handler_ios.py
Normal file
@@ -0,0 +1,280 @@
|
||||
"""Action handler for iOS automation using WebDriverAgent."""
|
||||
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Callable
|
||||
|
||||
from phone_agent.xctest import (
|
||||
back,
|
||||
double_tap,
|
||||
home,
|
||||
launch_app,
|
||||
long_press,
|
||||
swipe,
|
||||
tap,
|
||||
)
|
||||
from phone_agent.xctest.input import clear_text, hide_keyboard, type_text
|
||||
|
||||
|
||||
@dataclass
|
||||
class ActionResult:
|
||||
"""Result of an action execution."""
|
||||
|
||||
success: bool
|
||||
should_finish: bool
|
||||
message: str | None = None
|
||||
requires_confirmation: bool = False
|
||||
|
||||
|
||||
class IOSActionHandler:
    """
    Handles execution of actions from AI model output for iOS devices.

    Each action is a dictionary. Its ``"_metadata"`` key selects between
    ``"finish"`` (stop) and ``"do"`` (run the handler named by ``"action"``).
    Device interaction is delegated to the ``phone_agent.xctest`` helpers,
    which receive this handler's WebDriverAgent URL and session ID.

    Args:
        wda_url: WebDriverAgent URL.
        session_id: Optional WDA session ID.
        confirmation_callback: Optional callback for sensitive action
            confirmation. Should return True to proceed, False to cancel.
            Defaults to a console prompt.
        takeover_callback: Optional callback for takeover requests
            (login, captcha). Defaults to a console prompt.
    """

    # Maps the action name emitted by the model to the name of the method that
    # implements it. Resolved with getattr() at lookup time so subclasses can
    # override individual handlers.
    _ACTION_HANDLERS = {
        "Launch": "_handle_launch",
        "Tap": "_handle_tap",
        "Type": "_handle_type",
        "Type_Name": "_handle_type",
        "Swipe": "_handle_swipe",
        "Back": "_handle_back",
        "Home": "_handle_home",
        "Double Tap": "_handle_double_tap",
        "Long Press": "_handle_long_press",
        "Wait": "_handle_wait",
        "Take_over": "_handle_takeover",
        "Note": "_handle_note",
        "Call_API": "_handle_call_api",
        "Interact": "_handle_interact",
    }

    def __init__(
        self,
        wda_url: str = "http://localhost:8100",
        session_id: str | None = None,
        confirmation_callback: Callable[[str], bool] | None = None,
        takeover_callback: Callable[[str], None] | None = None,
    ):
        self.wda_url = wda_url
        self.session_id = session_id
        # Fall back to the console-based prompts when no callback is supplied.
        self.confirmation_callback = confirmation_callback or self._default_confirmation
        self.takeover_callback = takeover_callback or self._default_takeover

    def execute(
        self, action: dict[str, Any], screen_width: int, screen_height: int
    ) -> ActionResult:
        """
        Execute an action from the AI model.

        Args:
            action: The action dictionary from the model.
            screen_width: Current screen width in pixels.
            screen_height: Current screen height in pixels.

        Returns:
            ActionResult indicating success and whether to finish. A
            ``"finish"`` action or an unknown ``"_metadata"`` value ends the
            run (``should_finish=True``); an unknown action name or a handler
            exception is reported as a failure without ending the run.
        """
        action_type = action.get("_metadata")

        if action_type == "finish":
            return ActionResult(
                success=True, should_finish=True, message=action.get("message")
            )

        if action_type != "do":
            return ActionResult(
                success=False,
                should_finish=True,
                message=f"Unknown action type: {action_type}",
            )

        action_name = action.get("action")
        handler_method = self._get_handler(action_name)

        if handler_method is None:
            return ActionResult(
                success=False,
                should_finish=False,
                message=f"Unknown action: {action_name}",
            )

        try:
            return handler_method(action, screen_width, screen_height)
        except Exception as e:
            # Deliberate catch-all at the dispatch boundary: any handler
            # failure is reported back to the caller instead of propagating.
            return ActionResult(
                success=False, should_finish=False, message=f"Action failed: {e}"
            )

    def _get_handler(self, action_name: str | None) -> Callable | None:
        """
        Get the handler method for an action.

        Returns None for unknown names and for any non-string value (for
        example a missing ``"action"`` key or a list emitted by the model), so
        the caller reports an unknown action instead of raising TypeError on
        an unhashable key.
        """
        if not isinstance(action_name, str):
            return None
        method_name = self._ACTION_HANDLERS.get(action_name)
        if method_name is None:
            return None
        return getattr(self, method_name)

    def _convert_relative_to_absolute(
        self, element: list[int], screen_width: int, screen_height: int
    ) -> tuple[int, int]:
        """Convert relative coordinates (0-1000) to absolute pixels."""
        x = int(element[0] / 1000 * screen_width)
        y = int(element[1] / 1000 * screen_height)
        return x, y

    @staticmethod
    def _parse_wait_duration(raw: Any, default: float = 1.0) -> float:
        """
        Parse a wait duration into seconds.

        Accepts strings such as ``"2 seconds"`` or ``"0.5"`` as well as plain
        int/float values. Anything that cannot be parsed, or that is not a
        finite non-negative number, yields ``default``.
        """
        try:
            if isinstance(raw, str):
                seconds = float(raw.replace("seconds", "").strip())
            elif isinstance(raw, (int, float)) and not isinstance(raw, bool):
                seconds = float(raw)
            else:
                return default
        except (ValueError, OverflowError):
            return default

        # Rejects negatives, NaN (every comparison is False) and infinity,
        # none of which time.sleep() accepts.
        if not (0 <= seconds < float("inf")):
            return default
        return seconds

    def _handle_launch(self, action: dict, width: int, height: int) -> ActionResult:
        """Handle app launch action (app name read from ``action["app"]``)."""
        app_name = action.get("app")
        if not app_name:
            return ActionResult(False, False, "No app name specified")

        success = launch_app(
            app_name, wda_url=self.wda_url, session_id=self.session_id
        )
        if success:
            return ActionResult(True, False)
        return ActionResult(False, False, f"App not found: {app_name}")

    def _handle_tap(self, action: dict, width: int, height: int) -> ActionResult:
        """
        Handle tap action.

        If the action carries a ``"message"``, it is treated as a sensitive
        operation: the confirmation callback is consulted first and a refusal
        ends the run without tapping.
        """
        element = action.get("element")
        if not element:
            return ActionResult(False, False, "No element coordinates")

        x, y = self._convert_relative_to_absolute(element, width, height)

        print(f"Physically tap on ({x}, {y})")

        # Check for sensitive operation
        if "message" in action:
            if not self.confirmation_callback(action["message"]):
                return ActionResult(
                    success=False,
                    should_finish=True,
                    message="User cancelled sensitive operation",
                )

        tap(x, y, wda_url=self.wda_url, session_id=self.session_id)
        return ActionResult(True, False)

    def _handle_type(self, action: dict, width: int, height: int) -> ActionResult:
        """
        Handle text input action.

        Clears existing text, types ``action["text"]`` (empty string if
        absent), then hides the keyboard, pausing 0.5 s after each step.
        """
        text = action.get("text", "")

        # Clear existing text and type new text
        clear_text(wda_url=self.wda_url, session_id=self.session_id)
        time.sleep(0.5)

        type_text(text, wda_url=self.wda_url, session_id=self.session_id)
        time.sleep(0.5)

        # Hide keyboard after typing
        hide_keyboard(wda_url=self.wda_url, session_id=self.session_id)
        time.sleep(0.5)

        return ActionResult(True, False)

    def _handle_swipe(self, action: dict, width: int, height: int) -> ActionResult:
        """Handle swipe action between ``action["start"]`` and ``action["end"]``."""
        start = action.get("start")
        end = action.get("end")

        if not start or not end:
            return ActionResult(False, False, "Missing swipe coordinates")

        start_x, start_y = self._convert_relative_to_absolute(start, width, height)
        end_x, end_y = self._convert_relative_to_absolute(end, width, height)

        print(f"Physically scroll from ({start_x}, {start_y}) to ({end_x}, {end_y})")

        swipe(
            start_x,
            start_y,
            end_x,
            end_y,
            wda_url=self.wda_url,
            session_id=self.session_id,
        )
        return ActionResult(True, False)

    def _handle_back(self, action: dict, width: int, height: int) -> ActionResult:
        """Handle back gesture (swipe from left edge)."""
        back(wda_url=self.wda_url, session_id=self.session_id)
        return ActionResult(True, False)

    def _handle_home(self, action: dict, width: int, height: int) -> ActionResult:
        """Handle home button action."""
        home(wda_url=self.wda_url, session_id=self.session_id)
        return ActionResult(True, False)

    def _handle_double_tap(self, action: dict, width: int, height: int) -> ActionResult:
        """Handle double tap action at ``action["element"]``."""
        element = action.get("element")
        if not element:
            return ActionResult(False, False, "No element coordinates")

        x, y = self._convert_relative_to_absolute(element, width, height)
        double_tap(x, y, wda_url=self.wda_url, session_id=self.session_id)
        return ActionResult(True, False)

    def _handle_long_press(self, action: dict, width: int, height: int) -> ActionResult:
        """Handle long press action at ``action["element"]`` (fixed duration=3.0)."""
        element = action.get("element")
        if not element:
            return ActionResult(False, False, "No element coordinates")

        x, y = self._convert_relative_to_absolute(element, width, height)
        long_press(
            x,
            y,
            duration=3.0,
            wda_url=self.wda_url,
            session_id=self.session_id,
        )
        return ActionResult(True, False)

    def _handle_wait(self, action: dict, width: int, height: int) -> ActionResult:
        """
        Handle wait action.

        Sleeps for ``action["duration"]`` (default ``"1 seconds"``). Values
        that cannot be interpreted as a finite non-negative number of seconds
        fall back to 1.0 second.
        """
        duration = self._parse_wait_duration(action.get("duration", "1 seconds"))
        time.sleep(duration)
        return ActionResult(True, False)

    def _handle_takeover(self, action: dict, width: int, height: int) -> ActionResult:
        """Handle takeover request (login, captcha, etc.)."""
        message = action.get("message", "User intervention required")
        self.takeover_callback(message)
        return ActionResult(True, False)

    def _handle_note(self, action: dict, width: int, height: int) -> ActionResult:
        """Handle note action (placeholder for content recording)."""
        # This action is typically used for recording page content
        # Implementation depends on specific requirements
        return ActionResult(True, False)

    def _handle_call_api(self, action: dict, width: int, height: int) -> ActionResult:
        """Handle API call action (placeholder for summarization)."""
        # This action is typically used for content summarization
        # Implementation depends on specific requirements
        return ActionResult(True, False)

    def _handle_interact(self, action: dict, width: int, height: int) -> ActionResult:
        """Handle interaction request (user choice needed)."""
        # This action signals that user input is needed
        return ActionResult(True, False, message="User interaction required")

    @staticmethod
    def _default_confirmation(message: str) -> bool:
        """
        Default confirmation callback using console input.

        Returns True only when the user answers "Y" (case-insensitive,
        surrounding whitespace ignored).
        """
        response = input(f"Sensitive operation: {message}\nConfirm? (Y/N): ")
        return response.strip().upper() == "Y"

    @staticmethod
    def _default_takeover(message: str) -> None:
        """Default takeover callback using console input."""
        input(f"{message}\nPress Enter after completing manual operation...")
|
||||
Reference in New Issue
Block a user