From b1ddd985529dea8eb387a30ac5b0da085bd3b811 Mon Sep 17 00:00:00 2001 From: liuyongbin Date: Sun, 14 Dec 2025 14:03:41 +0800 Subject: [PATCH] Add inference performance metrics and fix formatting --- main.py | 1 + phone_agent/actions/handler.py | 2 +- phone_agent/config/i18n.py | 8 +++++ phone_agent/model/client.py | 55 +++++++++++++++++++++++++++++++++- scripts/check_deployment_en.py | 24 +++++++++++---- 5 files changed, 83 insertions(+), 7 deletions(-) diff --git a/main.py b/main.py index 6022251..39942fb 100644 --- a/main.py +++ b/main.py @@ -476,6 +476,7 @@ def main(): base_url=args.base_url, model_name=args.model, api_key=args.apikey, + lang=args.lang, ) agent_config = AgentConfig( diff --git a/phone_agent/actions/handler.py b/phone_agent/actions/handler.py index e856c95..5a0b0d9 100644 --- a/phone_agent/actions/handler.py +++ b/phone_agent/actions/handler.py @@ -285,7 +285,7 @@ def parse_action(response: str) -> dict[str, Any]: if response.startswith("do"): # Use AST parsing instead of eval for safety try: - tree = ast.parse(response, mode='eval') + tree = ast.parse(response, mode="eval") if not isinstance(tree.body, ast.Call): raise ValueError("Expected a function call") diff --git a/phone_agent/config/i18n.py b/phone_agent/config/i18n.py index a5070ee..3966022 100644 --- a/phone_agent/config/i18n.py +++ b/phone_agent/config/i18n.py @@ -19,6 +19,10 @@ MESSAGES_ZH = { "step": "步骤", "task": "任务", "result": "结果", + "performance_metrics": "性能指标", + "time_to_first_token": "首 Token 延迟 (TTFT)", + "time_to_thinking_end": "思考完成延迟", + "total_inference_time": "总推理时间", } # English messages @@ -40,6 +44,10 @@ MESSAGES_EN = { "step": "Step", "task": "Task", "result": "Result", + "performance_metrics": "Performance Metrics", + "time_to_first_token": "Time to First Token (TTFT)", + "time_to_thinking_end": "Time to Thinking End", + "total_inference_time": "Total Inference Time", } diff --git a/phone_agent/model/client.py b/phone_agent/model/client.py index ccf77ea..72377a6 100644 --- a/phone_agent/model/client.py 
+++ b/phone_agent/model/client.py @@ -1,11 +1,14 @@ """Model client for AI inference using OpenAI-compatible API.""" import json +import time from dataclasses import dataclass, field from typing import Any from openai import OpenAI +from phone_agent.config.i18n import get_message + @dataclass class ModelConfig: @@ -19,6 +22,7 @@ class ModelConfig: top_p: float = 0.85 frequency_penalty: float = 0.2 extra_body: dict[str, Any] = field(default_factory=dict) + lang: str = "cn" # Language for UI messages: 'cn' or 'en' @dataclass @@ -28,6 +32,10 @@ class ModelResponse: thinking: str action: str raw_content: str + # Performance metrics + time_to_first_token: float | None = None # Time to first token (seconds) + time_to_thinking_end: float | None = None # Time to thinking end (seconds) + total_time: float | None = None # Total inference time (seconds) class ModelClient: @@ -55,6 +63,11 @@ class ModelClient: Raises: ValueError: If the response cannot be parsed. """ + # Start timing on a monotonic clock so intervals are immune to wall-clock adjustments + start_time = time.perf_counter() + time_to_first_token = None + time_to_thinking_end = None + stream = self.client.chat.completions.create( messages=messages, model=self.config.model_name, @@ -70,6 +83,7 @@ class ModelClient: buffer = "" # Buffer to hold content that might be part of a marker action_markers = ["finish(message=", "do(action="] in_action_phase = False # Track if we've entered the action phase + first_token_received = False for chunk in stream: if len(chunk.choices) == 0: @@ -78,6 +92,11 @@ class ModelClient: content = chunk.choices[0].delta.content raw_content += content + # Record time to first token + if not first_token_received: + time_to_first_token = time.perf_counter() - start_time + first_token_received = True + if in_action_phase: # Already in action phase, just accumulate content without printing continue @@ -94,6 +113,11 @@ class ModelClient: print() # Print newline after thinking is complete in_action_phase = True marker_found = True + + # Record time to thinking end + if 
time_to_thinking_end is None: + time_to_thinking_end = time.perf_counter() - start_time + break if marker_found: @@ -115,10 +139,39 @@ class ModelClient: print(buffer, end="", flush=True) buffer = "" + # Calculate total time + total_time = time.perf_counter() - start_time + # Parse thinking and action from response thinking, action = self._parse_response(raw_content) - return ModelResponse(thinking=thinking, action=action, raw_content=raw_content) + # Print performance metrics + lang = self.config.lang + print() + print("=" * 50) + print(f"⏱️ {get_message('performance_metrics', lang)}:") + print("-" * 50) + if time_to_first_token is not None: + print( + f"{get_message('time_to_first_token', lang)}: {time_to_first_token:.3f}s" + ) + if time_to_thinking_end is not None: + print( + f"{get_message('time_to_thinking_end', lang)}: {time_to_thinking_end:.3f}s" + ) + print( + f"{get_message('total_inference_time', lang)}: {total_time:.3f}s" + ) + print("=" * 50) + + return ModelResponse( + thinking=thinking, + action=action, + raw_content=raw_content, + time_to_first_token=time_to_first_token, + time_to_thinking_end=time_to_thinking_end, + total_time=total_time, + ) def _parse_response(self, content: str) -> tuple[str, str]: """ diff --git a/scripts/check_deployment_en.py b/scripts/check_deployment_en.py index 808ed89..06e94ec 100644 --- a/scripts/check_deployment_en.py +++ b/scripts/check_deployment_en.py @@ -41,19 +41,31 @@ Usage examples: ) parser.add_argument( - "--max-tokens", type=int, default=3000, help="Maximum generation tokens (default: 3000)" + "--max-tokens", + type=int, + default=3000, + help="Maximum generation tokens (default: 3000)", ) parser.add_argument( - "--temperature", type=float, default=0.0, help="Sampling temperature (default: 0.0)" + "--temperature", + type=float, + default=0.0, + help="Sampling temperature (default: 0.0)", ) parser.add_argument( - "--top_p", type=float, default=0.85, help="Nucleus sampling parameter (default: 0.85)" + "--top_p", + type=float, + 
default=0.85, + help="Nucleus sampling parameter (default: 0.85)", ) parser.add_argument( - "--frequency_penalty", type=float, default=0.2, help="Frequency penalty parameter (default: 0.2)" + "--frequency_penalty", + type=float, + default=0.2, + help="Frequency penalty parameter (default: 0.2)", ) args = parser.parse_args() @@ -103,7 +115,9 @@ Usage examples: print(f" - Completion tokens: {response.usage.completion_tokens}") print(f" - Total tokens: {response.usage.total_tokens}") - print(f"\nPlease evaluate the above inference result to determine if the model deployment meets expectations.") + print( + "\nPlease evaluate the above inference result to determine if the model deployment meets expectations." + ) except Exception as e: print(f"\nError occurred while calling API:")