- 在hdc/input.py中实现多行文本支持,使用HarmonyOS keyEvent 2054处理换行 - 移除type_text函数的x/y坐标参数,简化接口 - 将多行文本处理逻辑从handler.py移至hdc/input.py,统一处理 - 优化parse_action函数,支持Type动作的text参数提取 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
400 lines
15 KiB
Python
400 lines
15 KiB
Python
"""Action handler for processing AI model outputs."""
|
|
|
|
import ast
|
|
import re
|
|
import subprocess
|
|
import time
|
|
from dataclasses import dataclass
|
|
from typing import Any, Callable
|
|
|
|
from phone_agent.config.timing import TIMING_CONFIG
|
|
from phone_agent.device_factory import get_device_factory
|
|
|
|
|
|
@dataclass
|
|
class ActionResult:
|
|
"""Result of an action execution."""
|
|
|
|
success: bool
|
|
should_finish: bool
|
|
message: str | None = None
|
|
requires_confirmation: bool = False
|
|
|
|
|
|
class ActionHandler:
|
|
"""
|
|
Handles execution of actions from AI model output.
|
|
|
|
Args:
|
|
device_id: Optional ADB device ID for multi-device setups.
|
|
confirmation_callback: Optional callback for sensitive action confirmation.
|
|
Should return True to proceed, False to cancel.
|
|
takeover_callback: Optional callback for takeover requests (login, captcha).
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
device_id: str | None = None,
|
|
confirmation_callback: Callable[[str], bool] | None = None,
|
|
takeover_callback: Callable[[str], None] | None = None,
|
|
):
|
|
self.device_id = device_id
|
|
self.confirmation_callback = confirmation_callback or self._default_confirmation
|
|
self.takeover_callback = takeover_callback or self._default_takeover
|
|
|
|
def execute(
|
|
self, action: dict[str, Any], screen_width: int, screen_height: int
|
|
) -> ActionResult:
|
|
"""
|
|
Execute an action from the AI model.
|
|
|
|
Args:
|
|
action: The action dictionary from the model.
|
|
screen_width: Current screen width in pixels.
|
|
screen_height: Current screen height in pixels.
|
|
|
|
Returns:
|
|
ActionResult indicating success and whether to finish.
|
|
"""
|
|
action_type = action.get("_metadata")
|
|
|
|
if action_type == "finish":
|
|
return ActionResult(
|
|
success=True, should_finish=True, message=action.get("message")
|
|
)
|
|
|
|
if action_type != "do":
|
|
return ActionResult(
|
|
success=False,
|
|
should_finish=True,
|
|
message=f"Unknown action type: {action_type}",
|
|
)
|
|
|
|
action_name = action.get("action")
|
|
handler_method = self._get_handler(action_name)
|
|
|
|
if handler_method is None:
|
|
return ActionResult(
|
|
success=False,
|
|
should_finish=False,
|
|
message=f"Unknown action: {action_name}",
|
|
)
|
|
|
|
try:
|
|
return handler_method(action, screen_width, screen_height)
|
|
except Exception as e:
|
|
return ActionResult(
|
|
success=False, should_finish=False, message=f"Action failed: {e}"
|
|
)
|
|
|
|
def _get_handler(self, action_name: str) -> Callable | None:
|
|
"""Get the handler method for an action."""
|
|
handlers = {
|
|
"Launch": self._handle_launch,
|
|
"Tap": self._handle_tap,
|
|
"Type": self._handle_type,
|
|
"Type_Name": self._handle_type,
|
|
"Swipe": self._handle_swipe,
|
|
"Back": self._handle_back,
|
|
"Home": self._handle_home,
|
|
"Double Tap": self._handle_double_tap,
|
|
"Long Press": self._handle_long_press,
|
|
"Wait": self._handle_wait,
|
|
"Take_over": self._handle_takeover,
|
|
"Note": self._handle_note,
|
|
"Call_API": self._handle_call_api,
|
|
"Interact": self._handle_interact,
|
|
}
|
|
return handlers.get(action_name)
|
|
|
|
def _convert_relative_to_absolute(
|
|
self, element: list[int], screen_width: int, screen_height: int
|
|
) -> tuple[int, int]:
|
|
"""Convert relative coordinates (0-1000) to absolute pixels."""
|
|
x = int(element[0] / 1000 * screen_width)
|
|
y = int(element[1] / 1000 * screen_height)
|
|
return x, y
|
|
|
|
def _handle_launch(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle app launch action."""
|
|
app_name = action.get("app")
|
|
if not app_name:
|
|
return ActionResult(False, False, "No app name specified")
|
|
|
|
device_factory = get_device_factory()
|
|
success = device_factory.launch_app(app_name, self.device_id)
|
|
if success:
|
|
return ActionResult(True, False)
|
|
return ActionResult(False, False, f"App not found: {app_name}")
|
|
|
|
def _handle_tap(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle tap action."""
|
|
element = action.get("element")
|
|
if not element:
|
|
return ActionResult(False, False, "No element coordinates")
|
|
|
|
x, y = self._convert_relative_to_absolute(element, width, height)
|
|
|
|
# Check for sensitive operation
|
|
if "message" in action:
|
|
if not self.confirmation_callback(action["message"]):
|
|
return ActionResult(
|
|
success=False,
|
|
should_finish=True,
|
|
message="User cancelled sensitive operation",
|
|
)
|
|
|
|
device_factory = get_device_factory()
|
|
device_factory.tap(x, y, self.device_id)
|
|
return ActionResult(True, False)
|
|
|
|
def _handle_type(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle text input action."""
|
|
text = action.get("text", "")
|
|
|
|
device_factory = get_device_factory()
|
|
|
|
# Switch to ADB keyboard
|
|
original_ime = device_factory.detect_and_set_adb_keyboard(self.device_id)
|
|
time.sleep(TIMING_CONFIG.action.keyboard_switch_delay)
|
|
|
|
# Clear existing text and type new text
|
|
device_factory.clear_text(self.device_id)
|
|
time.sleep(TIMING_CONFIG.action.text_clear_delay)
|
|
|
|
# Handle multiline text by splitting on newlines
|
|
device_factory.type_text(text, self.device_id)
|
|
time.sleep(TIMING_CONFIG.action.text_input_delay)
|
|
|
|
# Restore original keyboard
|
|
device_factory.restore_keyboard(original_ime, self.device_id)
|
|
time.sleep(TIMING_CONFIG.action.keyboard_restore_delay)
|
|
|
|
return ActionResult(True, False)
|
|
|
|
def _handle_swipe(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle swipe action."""
|
|
start = action.get("start")
|
|
end = action.get("end")
|
|
|
|
if not start or not end:
|
|
return ActionResult(False, False, "Missing swipe coordinates")
|
|
|
|
start_x, start_y = self._convert_relative_to_absolute(start, width, height)
|
|
end_x, end_y = self._convert_relative_to_absolute(end, width, height)
|
|
|
|
device_factory = get_device_factory()
|
|
device_factory.swipe(start_x, start_y, end_x, end_y, device_id=self.device_id)
|
|
return ActionResult(True, False)
|
|
|
|
def _handle_back(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle back button action."""
|
|
device_factory = get_device_factory()
|
|
device_factory.back(self.device_id)
|
|
return ActionResult(True, False)
|
|
|
|
def _handle_home(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle home button action."""
|
|
device_factory = get_device_factory()
|
|
device_factory.home(self.device_id)
|
|
return ActionResult(True, False)
|
|
|
|
def _handle_double_tap(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle double tap action."""
|
|
element = action.get("element")
|
|
if not element:
|
|
return ActionResult(False, False, "No element coordinates")
|
|
|
|
x, y = self._convert_relative_to_absolute(element, width, height)
|
|
device_factory = get_device_factory()
|
|
device_factory.double_tap(x, y, self.device_id)
|
|
return ActionResult(True, False)
|
|
|
|
def _handle_long_press(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle long press action."""
|
|
element = action.get("element")
|
|
if not element:
|
|
return ActionResult(False, False, "No element coordinates")
|
|
|
|
x, y = self._convert_relative_to_absolute(element, width, height)
|
|
device_factory = get_device_factory()
|
|
device_factory.long_press(x, y, device_id=self.device_id)
|
|
return ActionResult(True, False)
|
|
|
|
def _handle_wait(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle wait action."""
|
|
duration_str = action.get("duration", "1 seconds")
|
|
try:
|
|
duration = float(duration_str.replace("seconds", "").strip())
|
|
except ValueError:
|
|
duration = 1.0
|
|
|
|
time.sleep(duration)
|
|
return ActionResult(True, False)
|
|
|
|
def _handle_takeover(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle takeover request (login, captcha, etc.)."""
|
|
message = action.get("message", "User intervention required")
|
|
self.takeover_callback(message)
|
|
return ActionResult(True, False)
|
|
|
|
def _handle_note(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle note action (placeholder for content recording)."""
|
|
# This action is typically used for recording page content
|
|
# Implementation depends on specific requirements
|
|
return ActionResult(True, False)
|
|
|
|
def _handle_call_api(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle API call action (placeholder for summarization)."""
|
|
# This action is typically used for content summarization
|
|
# Implementation depends on specific requirements
|
|
return ActionResult(True, False)
|
|
|
|
def _handle_interact(self, action: dict, width: int, height: int) -> ActionResult:
|
|
"""Handle interaction request (user choice needed)."""
|
|
# This action signals that user input is needed
|
|
return ActionResult(True, False, message="User interaction required")
|
|
|
|
def _send_keyevent(self, keycode: str) -> None:
|
|
"""Send a keyevent to the device."""
|
|
from phone_agent.device_factory import DeviceType, get_device_factory
|
|
from phone_agent.hdc.connection import _run_hdc_command
|
|
|
|
device_factory = get_device_factory()
|
|
|
|
# Handle HDC devices with HarmonyOS-specific keyEvent command
|
|
if device_factory.device_type == DeviceType.HDC:
|
|
hdc_prefix = ["hdc", "-t", self.device_id] if self.device_id else ["hdc"]
|
|
|
|
# Map common keycodes to HarmonyOS keyEvent codes
|
|
# KEYCODE_ENTER (66) -> 2054 (HarmonyOS Enter key code)
|
|
if keycode == "KEYCODE_ENTER" or keycode == "66":
|
|
_run_hdc_command(
|
|
hdc_prefix + ["shell", "uitest", "uiInput", "keyEvent", "2054"],
|
|
capture_output=True,
|
|
text=True,
|
|
)
|
|
else:
|
|
# For other keys, try to use the numeric code directly
|
|
# If keycode is a string like "KEYCODE_ENTER", convert it
|
|
try:
|
|
# Try to extract numeric code from string or use as-is
|
|
if keycode.startswith("KEYCODE_"):
|
|
# For now, only handle ENTER, other keys may need mapping
|
|
if "ENTER" in keycode:
|
|
_run_hdc_command(
|
|
hdc_prefix + ["shell", "uitest", "uiInput", "keyEvent", "2054"],
|
|
capture_output=True,
|
|
text=True,
|
|
)
|
|
else:
|
|
# Fallback to ADB-style command for unsupported keys
|
|
subprocess.run(
|
|
hdc_prefix + ["shell", "input", "keyevent", keycode],
|
|
capture_output=True,
|
|
text=True,
|
|
)
|
|
else:
|
|
# Assume it's a numeric code
|
|
_run_hdc_command(
|
|
hdc_prefix + ["shell", "uitest", "uiInput", "keyEvent", str(keycode)],
|
|
capture_output=True,
|
|
text=True,
|
|
)
|
|
except Exception:
|
|
# Fallback to ADB-style command
|
|
subprocess.run(
|
|
hdc_prefix + ["shell", "input", "keyevent", keycode],
|
|
capture_output=True,
|
|
text=True,
|
|
)
|
|
else:
|
|
# ADB devices use standard input keyevent command
|
|
cmd_prefix = ["adb", "-s", self.device_id] if self.device_id else ["adb"]
|
|
subprocess.run(
|
|
cmd_prefix + ["shell", "input", "keyevent", keycode],
|
|
capture_output=True,
|
|
text=True,
|
|
)
|
|
|
|
@staticmethod
|
|
def _default_confirmation(message: str) -> bool:
|
|
"""Default confirmation callback using console input."""
|
|
response = input(f"Sensitive operation: {message}\nConfirm? (Y/N): ")
|
|
return response.upper() == "Y"
|
|
|
|
@staticmethod
|
|
def _default_takeover(message: str) -> None:
|
|
"""Default takeover callback using console input."""
|
|
input(f"{message}\nPress Enter after completing manual operation...")
|
|
|
|
|
|
def parse_action(response: str) -> dict[str, Any]:
|
|
"""
|
|
Parse action from model response.
|
|
|
|
Args:
|
|
response: Raw response string from the model.
|
|
|
|
Returns:
|
|
Parsed action dictionary.
|
|
|
|
Raises:
|
|
ValueError: If the response cannot be parsed.
|
|
"""
|
|
print(f"Parsing action: {response}")
|
|
try:
|
|
response = response.strip()
|
|
if response.startswith('do(action="Type"') or response.startswith(
|
|
'do(action="Type_Name"'
|
|
):
|
|
text = response.split("text=", 1)[1][1:-2]
|
|
action = {"_metadata": "do", "action": "Type", "text": text}
|
|
return action
|
|
elif response.startswith("do"):
|
|
# Use AST parsing instead of eval for safety
|
|
try:
|
|
# Escape special characters (newlines, tabs, etc.) for valid Python syntax
|
|
response = response.replace('\n', '\\n')
|
|
response = response.replace('\r', '\\r')
|
|
response = response.replace('\t', '\\t')
|
|
|
|
tree = ast.parse(response, mode="eval")
|
|
if not isinstance(tree.body, ast.Call):
|
|
raise ValueError("Expected a function call")
|
|
|
|
call = tree.body
|
|
# Extract keyword arguments safely
|
|
action = {"_metadata": "do"}
|
|
for keyword in call.keywords:
|
|
key = keyword.arg
|
|
value = ast.literal_eval(keyword.value)
|
|
action[key] = value
|
|
|
|
return action
|
|
except (SyntaxError, ValueError) as e:
|
|
raise ValueError(f"Failed to parse do() action: {e}")
|
|
|
|
elif response.startswith("finish"):
|
|
action = {
|
|
"_metadata": "finish",
|
|
"message": response.replace("finish(message=", "")[1:-2],
|
|
}
|
|
else:
|
|
raise ValueError(f"Failed to parse action: {response}")
|
|
return action
|
|
except Exception as e:
|
|
raise ValueError(f"Failed to parse action: {e}")
|
|
|
|
|
|
def do(**kwargs) -> dict[str, Any]:
|
|
"""Helper function for creating 'do' actions."""
|
|
kwargs["_metadata"] = "do"
|
|
return kwargs
|
|
|
|
|
|
def finish(**kwargs) -> dict[str, Any]:
|
|
"""Helper function for creating 'finish' actions."""
|
|
kwargs["_metadata"] = "finish"
|
|
return kwargs
|