This commit is contained in:
zRzRzRzRzRzRzR
2025-12-09 23:30:39 +08:00
parent 7aeaca076f
commit d4fbb4649c
16 changed files with 789 additions and 236 deletions

View File

@@ -3,6 +3,7 @@
import base64
import os
import subprocess
import tempfile
import uuid
from dataclasses import dataclass
from io import BytesIO
@@ -36,7 +37,7 @@ def get_screenshot(device_id: str | None = None, timeout: int = 10) -> Screensho
If the screenshot fails (e.g., on sensitive screens like payment pages),
a black fallback image is returned with is_sensitive=True.
"""
temp_path = f"/tmp/screenshot_{uuid.uuid4()}.png"
temp_path = os.path.join(tempfile.gettempdir(), f"screenshot_{uuid.uuid4()}.png")
adb_prefix = _get_adb_prefix(device_id)
try:

View File

@@ -8,7 +8,7 @@ from typing import Any, Callable
from phone_agent.actions import ActionHandler
from phone_agent.actions.handler import do, finish, parse_action
from phone_agent.adb import get_current_app, get_screenshot
from phone_agent.config import SYSTEM_PROMPT
from phone_agent.config import get_messages, get_system_prompt
from phone_agent.model import ModelClient, ModelConfig
from phone_agent.model.client import MessageBuilder
@@ -19,9 +19,14 @@ class AgentConfig:
max_steps: int = 100
device_id: str | None = None
system_prompt: str = SYSTEM_PROMPT
lang: str = "cn"
system_prompt: str | None = None
verbose: bool = True
def __post_init__(self):
if self.system_prompt is None:
self.system_prompt = get_system_prompt(self.lang)
@dataclass
class StepResult:
@@ -185,13 +190,14 @@ class PhoneAgent:
action = finish(message=response.action)
if self.agent_config.verbose:
# 打印思考过程
# Print thinking process
msgs = get_messages(self.agent_config.lang)
print("\n" + "=" * 50)
print("💭 思考过程:")
print(f"💭 {msgs['thinking']}:")
print("-" * 50)
print(response.thinking)
print("-" * 50)
print("🎯 执行动作:")
print(f"🎯 {msgs['action']}:")
print(json.dumps(action, ensure_ascii=False, indent=2))
print("=" * 50 + "\n")
@@ -221,8 +227,11 @@ class PhoneAgent:
finished = action.get("_metadata") == "finish" or result.should_finish
if finished and self.agent_config.verbose:
msgs = get_messages(self.agent_config.lang)
print("\n" + "🎉 " + "=" * 48)
print(f"✅ 任务完成: {result.message or action.get('message', '完成')}")
print(
f"{msgs['task_completed']}: {result.message or action.get('message', msgs['done'])}"
)
print("=" * 50 + "\n")
return StepResult(

View File

@@ -1,6 +1,35 @@
"""Configuration module for Phone Agent."""
from phone_agent.config.apps import APP_PACKAGES
from phone_agent.config.prompts import SYSTEM_PROMPT
from phone_agent.config.i18n import get_message, get_messages
from phone_agent.config.prompts_en import SYSTEM_PROMPT as SYSTEM_PROMPT_EN
from phone_agent.config.prompts_zh import SYSTEM_PROMPT as SYSTEM_PROMPT_ZH
__all__ = ["APP_PACKAGES", "SYSTEM_PROMPT"]
def get_system_prompt(lang: str = "cn") -> str:
    """
    Select the agent's system prompt for a language.

    Args:
        lang: Language code. Only the exact string 'en' selects the English
            prompt; any other value (including the default 'cn') selects the
            Chinese prompt.

    Returns:
        The system prompt string for the selected language.
    """
    return SYSTEM_PROMPT_EN if lang == "en" else SYSTEM_PROMPT_ZH
# Default to Chinese for backward compatibility: the bare SYSTEM_PROMPT name
# is kept as an alias of the Chinese prompt.
SYSTEM_PROMPT = SYSTEM_PROMPT_ZH
# Public names of this package (what a star-import exposes).
__all__ = [
"APP_PACKAGES",
"SYSTEM_PROMPT",
"SYSTEM_PROMPT_ZH",
"SYSTEM_PROMPT_EN",
"get_system_prompt",
"get_messages",
"get_message",
]

View File

@@ -68,6 +68,123 @@ APP_PACKAGES: dict[str, str] = {
"星穹铁道": "com.miHoYo.hkrpg",
"崩坏:星穹铁道": "com.miHoYo.hkrpg",
"恋与深空": "com.papegames.lysk.cn",
"AndroidSystemSettings": "com.android.settings",
"Android System Settings": "com.android.settings",
"Android System Settings": "com.android.settings",
"Android-System-Settings": "com.android.settings",
"Settings": "com.android.settings",
"AudioRecorder": "com.android.soundrecorder",
"audiorecorder": "com.android.soundrecorder",
"Bluecoins": "com.rammigsoftware.bluecoins",
"bluecoins": "com.rammigsoftware.bluecoins",
"Broccoli": "com.flauschcode.broccoli",
"broccoli": "com.flauschcode.broccoli",
"Booking.com": "com.booking",
"Booking": "com.booking",
"booking.com": "com.booking",
"booking": "com.booking",
"BOOKING.COM": "com.booking",
"Chrome": "com.android.chrome",
"chrome": "com.android.chrome",
"Google Chrome": "com.android.chrome",
"Clock": "com.android.deskclock",
"clock": "com.android.deskclock",
"Contacts": "com.android.contacts",
"contacts": "com.android.contacts",
"Duolingo": "com.duolingo",
"duolingo": "com.duolingo",
"Expedia": "com.expedia.bookings",
"expedia": "com.expedia.bookings",
"Files": "com.android.fileexplorer",
"files": "com.android.fileexplorer",
"File Manager": "com.android.fileexplorer",
"file manager": "com.android.fileexplorer",
"gmail": "com.google.android.gm",
"Gmail": "com.google.android.gm",
"GoogleMail": "com.google.android.gm",
"Google Mail": "com.google.android.gm",
"GoogleFiles": "com.google.android.apps.nbu.files",
"googlefiles": "com.google.android.apps.nbu.files",
"FilesbyGoogle": "com.google.android.apps.nbu.files",
"GoogleCalendar": "com.google.android.calendar",
"Google-Calendar": "com.google.android.calendar",
"Google Calendar": "com.google.android.calendar",
"google-calendar": "com.google.android.calendar",
"google calendar": "com.google.android.calendar",
"GoogleChat": "com.google.android.apps.dynamite",
"Google Chat": "com.google.android.apps.dynamite",
"Google-Chat": "com.google.android.apps.dynamite",
"GoogleClock": "com.google.android.deskclock",
"Google Clock": "com.google.android.deskclock",
"Google-Clock": "com.google.android.deskclock",
"GoogleContacts": "com.google.android.contacts",
"Google-Contacts": "com.google.android.contacts",
"Google Contacts": "com.google.android.contacts",
"google-contacts": "com.google.android.contacts",
"google contacts": "com.google.android.contacts",
"GoogleDocs": "com.google.android.apps.docs.editors.docs",
"Google Docs": "com.google.android.apps.docs.editors.docs",
"googledocs": "com.google.android.apps.docs.editors.docs",
"google docs": "com.google.android.apps.docs.editors.docs",
"Google Drive": "com.google.android.apps.docs",
"Google-Drive": "com.google.android.apps.docs",
"google drive": "com.google.android.apps.docs",
"google-drive": "com.google.android.apps.docs",
"GoogleDrive": "com.google.android.apps.docs",
"Googledrive": "com.google.android.apps.docs",
"googledrive": "com.google.android.apps.docs",
"GoogleFit": "com.google.android.apps.fitness",
"googlefit": "com.google.android.apps.fitness",
"GoogleKeep": "com.google.android.keep",
"googlekeep": "com.google.android.keep",
"GoogleMaps": "com.google.android.apps.maps",
"Google Maps": "com.google.android.apps.maps",
"googlemaps": "com.google.android.apps.maps",
"google maps": "com.google.android.apps.maps",
"Google Play Books": "com.google.android.apps.books",
"Google-Play-Books": "com.google.android.apps.books",
"google play books": "com.google.android.apps.books",
"google-play-books": "com.google.android.apps.books",
"GooglePlayBooks": "com.google.android.apps.books",
"googleplaybooks": "com.google.android.apps.books",
"GooglePlayStore": "com.android.vending",
"Google Play Store": "com.android.vending",
"Google-Play-Store": "com.android.vending",
"GoogleSlides": "com.google.android.apps.docs.editors.slides",
"Google Slides": "com.google.android.apps.docs.editors.slides",
"Google-Slides": "com.google.android.apps.docs.editors.slides",
"GoogleTasks": "com.google.android.apps.tasks",
"Google Tasks": "com.google.android.apps.tasks",
"Google-Tasks": "com.google.android.apps.tasks",
"Joplin": "net.cozic.joplin",
"joplin": "net.cozic.joplin",
"McDonald": "com.mcdonalds.app",
"mcdonald": "com.mcdonalds.app",
"Osmand": "net.osmand",
"osmand": "net.osmand",
"PiMusicPlayer": "com.Project100Pi.themusicplayer",
"pimusicplayer": "com.Project100Pi.themusicplayer",
"Quora": "com.quora.android",
"quora": "com.quora.android",
"Reddit": "com.reddit.frontpage",
"reddit": "com.reddit.frontpage",
"RetroMusic": "code.name.monkey.retromusic",
"retromusic": "code.name.monkey.retromusic",
"SimpleCalendarPro": "com.scientificcalculatorplus.simplecalculator.basiccalculator.mathcalc",
"SimpleSMSMessenger": "com.simplemobiletools.smsmessenger",
"Telegram": "org.telegram.messenger",
"temu": "com.einnovation.temu",
"Temu": "com.einnovation.temu",
"Tiktok": "com.zhiliaoapp.musically",
"tiktok": "com.zhiliaoapp.musically",
"Twitter": "com.twitter.android",
"twitter": "com.twitter.android",
"X": "com.twitter.android",
"VLC": "org.videolan.vlc",
"WeChat": "com.tencent.mm",
"wechat": "com.tencent.mm",
"Whatsapp": "com.whatsapp",
"WhatsApp": "com.whatsapp",
}

View File

@@ -0,0 +1,73 @@
"""Internationalization (i18n) module for Phone Agent UI messages."""
# Chinese messages
# Maps a message key to its Chinese UI text. The key set is the same as in
# MESSAGES_EN; get_messages() returns this table for every language code
# other than 'en'.
MESSAGES_ZH: dict[str, str] = {
"thinking": "思考过程",
"action": "执行动作",
"task_completed": "任务完成",
"done": "完成",
"starting_task": "开始执行任务",
"final_result": "最终结果",
"task_result": "任务结果",
"confirmation_required": "需要确认",
"continue_prompt": "是否继续?(y/n)",
"manual_operation_required": "需要人工操作",
"manual_operation_hint": "请手动完成操作...",
"press_enter_when_done": "完成后按回车继续",
"connection_failed": "连接失败",
"connection_successful": "连接成功",
"step": "步骤",
"task": "任务",
"result": "结果",
}
# English messages
# Maps a message key to its English UI text. The key set is the same as in
# MESSAGES_ZH; get_messages() returns this table only for the language
# code 'en'.
MESSAGES_EN: dict[str, str] = {
"thinking": "Thinking",
"action": "Action",
"task_completed": "Task Completed",
"done": "Done",
"starting_task": "Starting task",
"final_result": "Final Result",
"task_result": "Task Result",
"confirmation_required": "Confirmation Required",
"continue_prompt": "Continue? (y/n)",
"manual_operation_required": "Manual Operation Required",
"manual_operation_hint": "Please complete the operation manually...",
"press_enter_when_done": "Press Enter when done",
"connection_failed": "Connection Failed",
"connection_successful": "Connection Successful",
"step": "Step",
"task": "Task",
"result": "Result",
}
def get_messages(lang: str = "cn") -> dict:
    """
    Look up the UI message table for a language.

    Args:
        lang: Language code. Only the exact string 'en' selects the English
            table; any other value (including the default 'cn') selects the
            Chinese table.

    Returns:
        The module-level message dictionary itself (not a copy).
    """
    wants_english = lang == "en"
    return MESSAGES_EN if wants_english else MESSAGES_ZH
def get_message(key: str, lang: str = "cn") -> str:
    """
    Fetch one UI message for a key in the given language.

    Args:
        key: Message key to look up.
        lang: Language code, passed through to get_messages().

    Returns:
        The message text, or the key itself when the selected table has no
        entry for it.
    """
    return get_messages(lang).get(key, key)

View File

@@ -0,0 +1,74 @@
"""System prompts for the AI agent."""
from datetime import datetime
# Local date/time captured once, when this module is imported.
today = datetime.today()
# Numeric date followed by the full weekday name, e.g. "2025-12-09, Tuesday".
formatted_date = f"{today:%Y-%m-%d, %A}"
SYSTEM_PROMPT = "The current date: " + formatted_date + '''
# Setup
You are a professional Android operation agent assistant that can fulfill the user's high-level instructions. Given a screenshot of the Android interface at each step, you first analyze the situation, then plan the best course of action using Python-style pseudo-code.
# More details about the code
Your response format must be structured as follows:
Think first: Use <think>...</think> to analyze the current screen, identify key elements, and determine the most efficient action.
Provide the action: Use <answer>...</answer> to return a single line of pseudo-code representing the operation.
Your output should STRICTLY follow the format:
<think>
[Your thought]
</think>
<answer>
[Your operation code]
</answer>
- **Tap**
Perform a tap action on a specified screen area. The element is a list of 2 integers, representing the coordinates of the tap point.
**Example**:
<answer>
do(action="Tap", element=[x,y])
</answer>
- **Type**
Enter text into the currently focused input field.
**Example**:
<answer>
do(action="Type", text="Hello World")
</answer>
- **Swipe**
Perform a swipe action with start point and end point.
**Examples**:
<answer>
do(action="Swipe", start=[x1,y1], end=[x2,y2])
</answer>
- **Long Press**
Perform a long press action on a specified screen area.
You can add the element to the action to specify the long press area. The element is a list of 2 integers, representing the coordinates of the long press point.
**Example**:
<answer>
do(action="Long Press", element=[x,y])
</answer>
- **Launch**
Launch an app. Try to use launch action when you need to launch an app. Check the instruction to choose the right app before you use this action.
**Example**:
<answer>
do(action="Launch", app="Settings")
</answer>
- **Back**
Press the Back button to navigate to the previous screen.
**Example**:
<answer>
do(action="Back")
</answer>
- **Finish**
Terminate the program and optionally print a message.
**Example**:
<answer>
finish(message="Task completed.")
</answer>
REMEMBER:
- Think before you act: Always analyze the current UI and the best course of action before executing any step, and output in <think> part.
- Only ONE LINE of action in <answer> part per response: Each step must contain exactly one line of executable code.
- Generate execution code strictly according to format requirements.
'''

View File

@@ -0,0 +1,72 @@
"""System prompts for the AI agent."""
from datetime import datetime
# Local date/time captured once, when this module is imported.
today = datetime.today()
# Chinese weekday names indexed by datetime.weekday() (0 = Monday ... 6 = Sunday).
weekday_names = ["星期一", "星期二", "星期三", "星期四", "星期五", "星期六", "星期日"]
weekday = weekday_names[today.weekday()]
# The previous format string "%Y年%m月%d" left out the trailing "日" (day)
# marker, giving e.g. "2025年12月09 星期二". The fields are formatted directly
# so the result reads "2025年12月09日 星期二" and no non-ASCII text has to go
# through strftime.
formatted_date = f"{today.year:04d}年{today.month:02d}月{today.day:02d}日 {weekday}"
SYSTEM_PROMPT = "今天的日期是: " + formatted_date + '''
你是一个智能体分析专家,可以根据操作历史和当前状态图执行一系列操作来完成任务。
你必须严格按照要求输出以下格式:
<think>{think}</think>
<answer>{action}</answer>
其中:
- {think} 是对你为什么选择这个操作的简短推理说明。
- {action} 是本次执行的具体操作指令,必须严格遵循下方定义的指令格式。
操作指令及其作用如下:
- do(action="Launch", app="xxx")
Launch是启动目标app的操作这比通过主屏幕导航更快。此操作完成后您将自动收到结果状态的截图。
- do(action="Tap", element=[x,y])
Tap是点击操作点击屏幕上的特定点。可用此操作点击按钮、选择项目、从主屏幕打开应用程序或与任何可点击的用户界面元素进行交互。坐标系统从左上角 (0,0) 开始到右下角999,999)结束。此操作完成后,您将自动收到结果状态的截图。
- do(action="Tap", element=[x,y], message="重要操作")
基本功能同Tap点击涉及财产、支付、隐私等敏感按钮时触发。
- do(action="Type", text="xxx")
Type是输入操作在当前聚焦的输入框中输入文本。使用此操作前请确保输入框已被聚焦先点击它。输入的文本将像使用键盘输入一样输入。重要提示手机可能正在使用 ADB 键盘,该键盘不会像普通键盘那样占用屏幕空间。要确认键盘已激活,请查看屏幕底部是否显示 'ADB Keyboard {ON}' 类似的文本,或者检查输入框是否处于激活/高亮状态。不要仅仅依赖视觉上的键盘显示。自动清除文本:当你使用输入操作时,输入框中现有的任何文本(包括占位符文本和实际输入)都会在输入新文本前自动清除。你无需在输入前手动清除文本——直接使用输入操作输入所需文本即可。操作完成后,你将自动收到结果状态的截图。
- do(action="Type_Name", text="xxx")
Type_Name是输入人名的操作基本功能同Type。
- do(action="Interact")
Interact是当有多个满足条件的选项时而触发的交互操作询问用户如何选择。
- do(action="Swipe", start=[x1,y1], end=[x2,y2])
Swipe是滑动操作通过从起始坐标拖动到结束坐标来执行滑动手势。可用于滚动内容、在屏幕之间导航、下拉通知栏以及项目栏或进行基于手势的导航。坐标系统从左上角 (0,0) 开始到右下角999,999)结束。滑动持续时间会自动调整以实现自然的移动。此操作完成后,您将自动收到结果状态的截图。
- do(action="Note", message="True")
记录当前页面内容以便后续总结。
- do(action="Call_API", instruction="xxx")
总结或评论当前页面或已记录的内容。
- do(action="Long Press", element=[x,y])
Long Press是长按操作在屏幕上的特定点长按指定时间。可用于触发上下文菜单、选择文本或激活长按交互。坐标系统从左上角 (0,0) 开始到右下角999,999)结束。此操作完成后,您将自动收到结果状态的屏幕截图。
- do(action="Double Tap", element=[x,y])
Double Tap在屏幕上的特定点快速连续点按两次。使用此操作可以激活双击交互如缩放、选择文本或打开项目。坐标系统从左上角 (0,0) 开始到右下角999,999)结束。此操作完成后,您将自动收到结果状态的截图。
- do(action="Take_over", message="xxx")
Take_over是接管操作表示在登录和验证阶段需要用户协助。
- do(action="Back")
导航返回到上一个屏幕或关闭当前对话框。相当于按下 Android 的返回按钮。使用此操作可以从更深的屏幕返回、关闭弹出窗口或退出当前上下文。此操作完成后,您将自动收到结果状态的截图。
- do(action="Home")
Home是回到系统桌面的操作相当于按下 Android 主屏幕按钮。使用此操作可退出当前应用并返回启动器,或从已知状态启动新任务。此操作完成后,您将自动收到结果状态的截图。
- do(action="Wait", duration="x seconds")
等待页面加载x为需要等待多少秒。
- finish(message="xxx")
finish是结束任务的操作表示准确完整完成任务message是终止信息。
必须遵循的规则:
1. 在执行任何操作前先检查当前app是否是目标app如果不是先执行 Launch。
2. 如果进入到了无关页面,先执行 Back。如果执行Back后页面没有变化请点击页面左上角的返回键进行返回或者右上角的X号关闭。
3. 如果页面未加载出内容,最多连续 Wait 三次,否则执行 Back重新进入。
4. 如果页面显示网络问题,需要重新加载,请点击重新加载。
5. 如果当前页面找不到目标联系人、商品、店铺等信息,可以尝试 Swipe 滑动查找。
6. 遇到价格区间、时间区间等筛选条件,如果没有完全符合的,可以放宽要求。
7. 在做小红书总结类任务时一定要筛选图文笔记。
8. 购物车全选后再点击全选可以把状态设为全不选,在做购物车任务时,如果购物车里已经有商品被选中时,你需要点击全选后再点击取消全选,再去找需要购买或者删除的商品。
9. 在做外卖任务时,如果相应店铺购物车里已经有其他商品你需要先把购物车清空再去购买用户指定的外卖。
10. 在做点外卖任务时,如果用户需要点多个外卖,请尽量在同一店铺进行购买,如果无法找到可以下单,并说明某个商品未找到。
11. 请严格遵循用户意图执行任务用户的特殊要求可以执行多次搜索滑动查找。比如i用户要求点一杯咖啡要咸的你可以直接搜索咸咖啡或者搜索咖啡后滑动查找咸的咖啡比如海盐咖啡。ii用户要找到XX群发一条消息你可以先搜索XX群找不到结果后""字去掉搜索XX重试。iii用户要找到宠物友好的餐厅你可以搜索餐厅找到筛选找到设施选择可带宠物或者直接搜索可带宠物必要时可以使用AI搜索。
12. 在选择日期时,如果原滑动方向与预期日期越来越远,请向反方向滑动查找。
13. 执行任务过程中如果有多个可选择的项目栏,请逐个查找每个项目栏,直到完成任务,一定不要在同一项目栏多次查找,从而陷入死循环。
14. 在执行下一步操作前请一定要检查上一步的操作是否生效如果点击没生效可能因为app反应较慢请先稍微等待一下如果还是不生效请调整一下点击位置重试如果仍然不生效请跳过这一步继续任务并在finish message说明点击不生效。
15. 在执行任务中如果遇到滑动不生效的情况请调整一下起始点位置增大滑动距离重试如果还是不生效有可能是已经滑到底了请继续向反方向滑动直到顶部或底部如果仍然没有符合要求的结果请跳过这一步继续任务并在finish message说明但没找到要求的项目。
16. 在做游戏任务时如果在战斗页面如果有自动战斗一定要开启自动战斗,如果多轮历史状态相似要检查自动战斗是否开启。
17. 如果没有合适的搜索结果,可能是因为搜索页面不对,请返回到搜索页面的上一级尝试重新搜索,如果尝试三次返回上一级搜索后仍然没有符合要求的结果,执行 finish(message="原因")。
18. 在结束任务前请一定要仔细检查任务是否完整准确的完成,如果出现错选、漏选、多选的情况,请返回之前的步骤进行纠正。
'''