281 lines
10 KiB
Python
281 lines
10 KiB
Python
"""Action handler for iOS automation using WebDriverAgent."""
|
|
|
|
import time
|
|
from dataclasses import dataclass
|
|
from typing import Any, Callable
|
|
|
|
from phone_agent.xctest import (
|
|
back,
|
|
double_tap,
|
|
home,
|
|
launch_app,
|
|
long_press,
|
|
swipe,
|
|
tap,
|
|
)
|
|
from phone_agent.xctest.input import clear_text, hide_keyboard, type_text
|
|
|
|
|
|
@dataclass
|
|
class ActionResult:
|
|
"""Result of an action execution."""
|
|
|
|
success: bool
|
|
should_finish: bool
|
|
message: str | None = None
|
|
requires_confirmation: bool = False
|
|
|
|
|
|
class IOSActionHandler:
|
|
"""
|
|
Handles execution of actions from AI model output for iOS devices.
|
|
|
|
Args:
|
|
wda_url: WebDriverAgent URL.
|
|
session_id: Optional WDA session ID.
|
|
confirmation_callback: Optional callback for sensitive action confirmation.
|
|
Should return True to proceed, False to cancel.
|
|
takeover_callback: Optional callback for takeover requests (login, captcha).
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
wda_url: str = "http://localhost:8100",
|
|
session_id: str | None = None,
|
|
confirmation_callback: Callable[[str], bool] | None = None,
|
|
takeover_callback: Callable[[str], None] | None = None,
|
|
):
|
|
self.wda_url = wda_url
|
|
self.session_id = session_id
|
|
self.confirmation_callback = confirmation_callback or self._default_confirmation
|
|
self.takeover_callback = takeover_callback or self._default_takeover
|
|
|
|
def execute(
|
|
self, action: dict[str, Any], screen_width: int, screen_height: int
|
|
) -> ActionResult:
|
|
"""
|
|
Execute an action from the AI model.
|
|
|
|
Args:
|
|
action: The action dictionary from the model.
|
|
screen_width: Current screen width in pixels.
|
|
screen_height: Current screen height in pixels.
|
|
|
|
Returns:
|
|
ActionResult indicating success and whether to finish.
|
|
"""
|
|
action_type = action.get("_metadata")
|
|
|
|
if action_type == "finish":
|
|
return ActionResult(
|
|
success=True, should_finish=True, message=action.get("message")
|
|
)
|
|
|
|
if action_type != "do":
|
|
return ActionResult(
|
|
success=False,
|
|
should_finish=True,
|
|
message=f"Unknown action type: {action_type}",
|
|
)
|
|
|
|
action_name = action.get("action")
|
|
handler_method = self._get_handler(action_name)
|
|
|
|
if handler_method is None:
|
|
return ActionResult(
|
|
success=False,
|
|
should_finish=False,
|
|
message=f"Unknown action: {action_name}",
|
|
)
|
|
|
|
try:
|
|
return handler_method(action, screen_width, screen_height)
|
|
except Exception as e:
|
|
return ActionResult(
|
|
success=False, should_finish=False, message=f"Action failed: {e}"
|
|
)
|
|
|
|
def _get_handler(self, action_name: str) -> Callable | None:
|
|
"""Get the handler method for an action."""
|
|
handlers = {
|
|
"Launch": self._handle_launch,
|
|
"Tap": self._handle_tap,
|
|
"Type": self._handle_type,
|
|
"Type_Name": self._handle_type,
|
|
"Swipe": self._handle_swipe,
|
|
"Back": self._handle_back,
|
|
"Home": self._handle_home,
|
|
"Double Tap": self._handle_double_tap,
|
|
"Long Press": self._handle_long_press,
|
|
"Wait": self._handle_wait,
|
|
"Take_over": self._handle_takeover,
|
|
"Note": self._handle_note,
|
|
"Call_API": self._handle_call_api,
|
|
"Interact": self._handle_interact,
|
|
}
|
|
return handlers.get(action_name)
|
|
|
|
def _convert_relative_to_absolute(
|
|
self, element: list[int], screen_width: int, screen_height: int
|
|
) -> tuple[int, int]:
|
|
"""Convert relative coordinates (0-1000) to absolute pixels."""
|
|
x = int(element[0] / 1000 * screen_width)
|
|
y = int(element[1] / 1000 * screen_height)
|
|
return x, y
|
|
|
|
def _handle_launch(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle app launch action."""
|
|
app_name = action.get("app")
|
|
if not app_name:
|
|
return ActionResult(False, False, "No app name specified")
|
|
|
|
success = launch_app(
|
|
app_name, wda_url=self.wda_url, session_id=self.session_id
|
|
)
|
|
if success:
|
|
return ActionResult(True, False)
|
|
return ActionResult(False, False, f"App not found: {app_name}")
|
|
|
|
def _handle_tap(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle tap action."""
|
|
element = action.get("element")
|
|
if not element:
|
|
return ActionResult(False, False, "No element coordinates")
|
|
|
|
x, y = self._convert_relative_to_absolute(element, width, height)
|
|
|
|
print(f"Physically tap on ({x}, {y})")
|
|
|
|
# Check for sensitive operation
|
|
if "message" in action:
|
|
if not self.confirmation_callback(action["message"]):
|
|
return ActionResult(
|
|
success=False,
|
|
should_finish=True,
|
|
message="User cancelled sensitive operation",
|
|
)
|
|
|
|
tap(x, y, wda_url=self.wda_url, session_id=self.session_id)
|
|
return ActionResult(True, False)
|
|
|
|
def _handle_type(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle text input action."""
|
|
text = action.get("text", "")
|
|
|
|
# Clear existing text and type new text
|
|
clear_text(wda_url=self.wda_url, session_id=self.session_id)
|
|
time.sleep(0.5)
|
|
|
|
type_text(text, wda_url=self.wda_url, session_id=self.session_id)
|
|
time.sleep(0.5)
|
|
|
|
# Hide keyboard after typing
|
|
hide_keyboard(wda_url=self.wda_url, session_id=self.session_id)
|
|
time.sleep(0.5)
|
|
|
|
return ActionResult(True, False)
|
|
|
|
def _handle_swipe(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle swipe action."""
|
|
start = action.get("start")
|
|
end = action.get("end")
|
|
|
|
if not start or not end:
|
|
return ActionResult(False, False, "Missing swipe coordinates")
|
|
|
|
start_x, start_y = self._convert_relative_to_absolute(start, width, height)
|
|
end_x, end_y = self._convert_relative_to_absolute(end, width, height)
|
|
|
|
print(f"Physically scroll from ({start_x}, {start_y}) to ({end_x}, {end_y})")
|
|
|
|
swipe(
|
|
start_x,
|
|
start_y,
|
|
end_x,
|
|
end_y,
|
|
wda_url=self.wda_url,
|
|
session_id=self.session_id,
|
|
)
|
|
return ActionResult(True, False)
|
|
|
|
def _handle_back(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle back gesture (swipe from left edge)."""
|
|
back(wda_url=self.wda_url, session_id=self.session_id)
|
|
return ActionResult(True, False)
|
|
|
|
def _handle_home(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle home button action."""
|
|
home(wda_url=self.wda_url, session_id=self.session_id)
|
|
return ActionResult(True, False)
|
|
|
|
def _handle_double_tap(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle double tap action."""
|
|
element = action.get("element")
|
|
if not element:
|
|
return ActionResult(False, False, "No element coordinates")
|
|
|
|
x, y = self._convert_relative_to_absolute(element, width, height)
|
|
double_tap(x, y, wda_url=self.wda_url, session_id=self.session_id)
|
|
return ActionResult(True, False)
|
|
|
|
def _handle_long_press(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle long press action."""
|
|
element = action.get("element")
|
|
if not element:
|
|
return ActionResult(False, False, "No element coordinates")
|
|
|
|
x, y = self._convert_relative_to_absolute(element, width, height)
|
|
long_press(
|
|
x,
|
|
y,
|
|
duration=3.0,
|
|
wda_url=self.wda_url,
|
|
session_id=self.session_id,
|
|
)
|
|
return ActionResult(True, False)
|
|
|
|
def _handle_wait(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle wait action."""
|
|
duration_str = action.get("duration", "1 seconds")
|
|
try:
|
|
duration = float(duration_str.replace("seconds", "").strip())
|
|
except ValueError:
|
|
duration = 1.0
|
|
|
|
time.sleep(duration)
|
|
return ActionResult(True, False)
|
|
|
|
def _handle_takeover(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle takeover request (login, captcha, etc.)."""
|
|
message = action.get("message", "User intervention required")
|
|
self.takeover_callback(message)
|
|
return ActionResult(True, False)
|
|
|
|
def _handle_note(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle note action (placeholder for content recording)."""
|
|
# This action is typically used for recording page content
|
|
# Implementation depends on specific requirements
|
|
return ActionResult(True, False)
|
|
|
|
def _handle_call_api(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle API call action (placeholder for summarization)."""
|
|
# This action is typically used for content summarization
|
|
# Implementation depends on specific requirements
|
|
return ActionResult(True, False)
|
|
|
|
def _handle_interact(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle interaction request (user choice needed)."""
|
|
# This action signals that user input is needed
|
|
return ActionResult(True, False, message="User interaction required")
|
|
|
|
@staticmethod
|
|
def _default_confirmation(message: str) -> bool:
|
|
"""Default confirmation callback using console input."""
|
|
response = input(f"Sensitive operation: {message}\nConfirm? (Y/N): ")
|
|
return response.upper() == "Y"
|
|
|
|
@staticmethod
|
|
def _default_takeover(message: str) -> None:
|
|
"""Default takeover callback using console input."""
|
|
input(f"{message}\nPress Enter after completing manual operation...")
|