draft init

2025-12-08 23:54:29 +08:00
commit 7e1785e08e
31 changed files with 3639 additions and 0 deletions
--- a/phone_agent/model/client.py
+++ b/phone_agent/model/client.py
@@ -0,0 +1,168 @@
+"""Model client for AI inference using OpenAI-compatible API."""
+
+import json
+from dataclasses import dataclass, field
+from typing import Any
+
+from openai import OpenAI
+
+
+@dataclass
+class ModelConfig:
+    """Endpoint and sampling settings for the OpenAI-compatible model server."""
+
+    base_url: str = "http://localhost:8000/v1"  # passed to OpenAI(base_url=...)
+    api_key: str = "EMPTY"  # passed to OpenAI(api_key=...)
+    model_name: str = "autoglm-phone-9b"  # sent as the request's ``model``
+    max_tokens: int = 3000
+    temperature: float = 0.0
+    top_p: float = 0.85
+    frequency_penalty: float = 0.2
+    extra_body: dict[str, Any] = field(  # factory gives each instance its own dict
+        default_factory=lambda: dict(skip_special_tokens=False)
+    )
+
+
+@dataclass
+class ModelResponse:
+    """One model reply, split into reasoning and action plus the original text."""
+
+    thinking: str  # reasoning part of the reply
+    action: str  # action part of the reply
+    raw_content: str  # message content before any parsing
+
+
+class ModelClient:
+    """
+    Client for interacting with OpenAI-compatible vision-language models.
+
+    Args:
+        config: Model configuration; a default ``ModelConfig()`` is used if omitted.
+    """
+
+    def __init__(self, config: ModelConfig | None = None):
+        self.config = config or ModelConfig()
+        self.client = OpenAI(base_url=self.config.base_url, api_key=self.config.api_key)
+
+    def request(self, messages: list[dict[str, Any]]) -> ModelResponse:
+        """
+        Send one chat completion request and split the reply into its parts.
+
+        Args:
+            messages: List of message dictionaries in OpenAI format.
+
+        Returns:
+            ModelResponse containing thinking, action and the raw reply text.
+
+        Raises:
+            ValueError: If the reply has no choices or no text content.
+        """
+        response = self.client.chat.completions.create(
+            messages=messages,
+            model=self.config.model_name,
+            max_tokens=self.config.max_tokens,
+            temperature=self.config.temperature,
+            top_p=self.config.top_p,
+            frequency_penalty=self.config.frequency_penalty,
+            extra_body=self.config.extra_body,
+        )
+
+        # ``content`` may be None and ``choices`` empty; raise the documented error.
+        raw_content = response.choices[0].message.content if response.choices else None
+        if raw_content is None:
+            raise ValueError("Model response contains no text content")
+
+        thinking, action = self._parse_response(raw_content)
+        return ModelResponse(thinking=thinking, action=action, raw_content=raw_content)
+
+    def _parse_response(self, content: str) -> tuple[str, str]:
+        """
+        Parse the model response into thinking and action parts.
+
+        Args:
+            content: Raw response content.
+
+        Returns:
+            Tuple of (thinking, action); ("", content) when ``<answer>`` is absent.
+        """
+        head, tag, tail = content.partition("<answer>")
+        if not tag:
+            return "", content
+        thinking = head.replace("<think>", "").replace("</think>", "").strip()
+        # Cut at the closing tag so trailing output cannot leak into the action.
+        action = tail.partition("</answer>")[0].strip()
+        return thinking, action
+
+
+class MessageBuilder:
+    """Static helpers that assemble OpenAI-format chat message dictionaries."""
+
+    @staticmethod
+    def create_system_message(content: str) -> dict[str, Any]:
+        """Wrap ``content`` as a message with the ``system`` role."""
+        return dict(role="system", content=content)
+
+    @staticmethod
+    def create_user_message(
+        text: str, image_base64: str | None = None
+    ) -> dict[str, Any]:
+        """
+        Build a ``user`` message whose content is a list of typed parts.
+
+        Args:
+            text: Text part; always the last entry of the content list.
+            image_base64: Base64 image payload. When truthy it is embedded
+                as a ``data:image/png`` URL part ahead of the text.
+
+        Returns:
+            Dict with role ``user`` and the list of content parts.
+        """
+        text_part = {"type": "text", "text": text}
+        if not image_base64:
+            # None and "" both mean there is no image to attach.
+            return {"role": "user", "content": [text_part]}
+
+        image_part = {
+            "type": "image_url",
+            "image_url": {"url": f"data:image/png;base64,{image_base64}"},
+        }
+        return {"role": "user", "content": [image_part, text_part]}
+
+    @staticmethod
+    def create_assistant_message(content: str) -> dict[str, Any]:
+        """Wrap ``content`` as a message with the ``assistant`` role."""
+        return dict(role="assistant", content=content)
+
+    @staticmethod
+    def remove_images_from_message(message: dict[str, Any]) -> dict[str, Any]:
+        """
+        Strip every non-text part from a message's content list, in place.
+
+        Args:
+            message: Message dictionary; left untouched when its
+                ``content`` is not a list.
+
+        Returns:
+            The same ``message`` object that was passed in.
+        """
+        parts = message.get("content")
+        if not isinstance(parts, list):
+            return message
+
+        message["content"] = [part for part in parts if part.get("type") == "text"]
+        return message
+
+    @staticmethod
+    def build_screen_info(current_app: str, **extra_info) -> str:
+        """
+        Serialize screen metadata as a JSON string.
+
+        Args:
+            current_app: Stored under the ``current_app`` key.
+            **extra_info: Extra key/value pairs added after it.
+
+        Returns:
+            JSON text with non-ASCII characters left unescaped.
+        """
+        payload = {"current_app": current_app} | extra_info
+        return json.dumps(payload, ensure_ascii=False)