Improve Video Learning Agent with action-based detection and analysis toggle
- Change video detection from screenshot hash to action-based (Swipe detection)
- Add enable_analysis toggle to disable VLM screenshot analysis
- Improve task prompt to prevent VLM from stopping prematurely
- Add debug logging for action detection troubleshooting
- Fix ModelResponse attribute error (content -> raw_content)

Co-Authored-By: Claude <noreply@anthropic.com>
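For context, a minimal usage sketch of the new enable_analysis toggle, assuming the VideoLearningAgent constructor shown in the diff below; the model_config value is a placeholder and is not part of this commit:

    # Sketch: disable VLM screenshot analysis so video detection relies purely on swipe actions.
    agent = VideoLearningAgent(
        model_config=model_config,           # a ModelConfig instance for the VLM (placeholder)
        platform="douyin",                   # or "kuaishou" / "tiktok"
        output_dir="./video_learning_data",
        enable_analysis=False,               # new toggle: skip ScreenshotAnalyzer initialization
    )

A browsing session would then be driven through start_session (declared later in the diff); its arguments are not shown in this hunk.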
@@ -132,7 +132,8 @@ class ScreenshotAnalyzer:
         try:
             # Call the VLM
             response = self.model_client.request(messages)
-            result_text = response.content.strip()
+            # ModelResponse uses raw_content rather than content
+            result_text = response.raw_content.strip()
 
             # Parse the JSON result
             return self._parse_result(result_text)
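Where different ModelResponse versions might expose either attribute, a defensive fallback is one option; this is only a sketch under that assumption, not what the commit does:

    # Sketch: prefer raw_content, fall back to content if a ModelResponse variant still exposes it.
    raw_text = getattr(response, "raw_content", None) or getattr(response, "content", "")
    result_text = raw_text.strip()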
@@ -209,6 +210,7 @@ class VideoLearningAgent:
         model_config: ModelConfig,
         platform: str = "douyin",
         output_dir: str = "./video_learning_data",
+        enable_analysis: bool = True,
     ):
         """
         Initialize Video Learning Agent.
@@ -217,11 +219,13 @@ class VideoLearningAgent:
             model_config: Model configuration for VLM
             platform: Platform name (douyin, kuaishou, tiktok)
             output_dir: Directory to save screenshots and data
+            enable_analysis: Whether to enable VLM screenshot analysis
         """
         self.model_config = model_config
         self.platform = platform
         self.output_dir = Path(output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)
+        self.enable_analysis = enable_analysis  # screenshot-analysis toggle
 
         # Create screenshots subdirectory
         self.screenshot_dir = self.output_dir / "screenshots"
@@ -239,20 +243,23 @@ class VideoLearningAgent:
         self.on_session_complete: Optional[Callable[[LearningSession], None]] = None
         self.on_progress_update: Optional[Callable[[int, int], None]] = None
 
-        # Video detection: track screenshot changes (simplified)
-        self._last_screenshot_hash: Optional[str] = None
+        # Video detection: action-based
+        self._first_video_recorded: bool = False  # whether the first video has been recorded
 
         # Skip app startup screens
         self._in_app_steps: int = 0
         self._warmup_steps: int = 3  # Skip first 3 steps after entering app
 
-        # Screenshot analyzer for content extraction
+        # Screenshot analyzer for content extraction (only if enabled)
         self._analyzer: Optional[ScreenshotAnalyzer] = None
-        try:
-            self._analyzer = ScreenshotAnalyzer(model_config)
-            print("[VideoLearning] Screenshot analyzer initialized")
-        except Exception as e:
-            print(f"[VideoLearning] Analyzer init failed: {e}")
+        if self.enable_analysis:
+            try:
+                self._analyzer = ScreenshotAnalyzer(model_config)
+                print("[VideoLearning] Screenshot analyzer initialized")
+            except Exception as e:
+                print(f"[VideoLearning] Analyzer init failed: {e}")
+        else:
+            print("[VideoLearning] Screenshot analysis disabled")
 
     def start_session(
         self,
@@ -360,10 +367,11 @@ class VideoLearningAgent:
         """
         Callback after each step.
 
-        Simplified logic:
-        1. Check if we're in the target app using get_current_app()
-        2. Detect screenshot changes
-        3. Record video when screenshot changes
+        Action-based detection logic:
+        1. Check whether we are in the target app
+        2. Skip steps during the warmup phase
+        3. Record the first video once warmup ends
+        4. Detect swipe actions and record a new video after each swipe
 
         Args:
             result: Step execution result
@@ -409,8 +417,9 @@ class VideoLearningAgent:
             is_in_target = any(pkg.lower() in current_app.lower() for pkg in packages)
 
             if not is_in_target:
-                # Reset warmup counter when leaving app
+                # Reset counters when leaving app
                 self._in_app_steps = 0
+                self._first_video_recorded = False
                 print(f"[VideoLearning] Not in target app: {current_app}")
                 return None
 
@@ -420,44 +429,52 @@ class VideoLearningAgent:
                 print(f"[VideoLearning] Warmup step {self._in_app_steps}/{self._warmup_steps}, skipping...")
                 return None
 
-            # Get screenshot
+            # Take a screenshot for recording
             screenshot = get_device_factory().get_screenshot(self._device_id)
 
-            # Use full base64 data for hash (more sensitive)
-            current_hash = hashlib.md5(screenshot.base64_data.encode()).hexdigest()
-            # Detect screenshot change and record video
-            if self._last_screenshot_hash is None:
-                # First screenshot in target app - record first video
-                self._last_screenshot_hash = current_hash
-                self._record_video_from_screenshot(screenshot)
-                print(f"[VideoLearning] ✓ Recorded video {self.video_counter}/{self.current_session.target_count}")
-
-                # Check if we've reached target after recording
-                if self.video_counter >= self.current_session.target_count:
-                    print(f"[VideoLearning] ✓ Target reached! Stopping...")
-                    self.current_session.is_active = False
-                    self._save_session()
-                    return "stop"
-
-            elif current_hash != self._last_screenshot_hash:
-                # Screenshot changed - record new video
-                self._last_screenshot_hash = current_hash
-                self._record_video_from_screenshot(screenshot)
-                print(f"[VideoLearning] ✓ Recorded video {self.video_counter}/{self.current_session.target_count}")
-                # Check if we've reached target after recording
-                if self.video_counter >= self.current_session.target_count:
-                    print(f"[VideoLearning] ✓ Target reached! Stopping...")
-                    self.current_session.is_active = False
-                    self._save_session()
-                    return "stop"
+            # First video: record it as soon as warmup ends
+            if not self._first_video_recorded:
+                self._first_video_recorded = True
+                self._record_video_from_screenshot(screenshot)
+                print(f"[VideoLearning] ✓ Recorded first video {self.video_counter}/{self.current_session.target_count}")
+                return self._check_target_reached()
+
+            # Action-based detection: look for a swipe action
+            action = result.action
+            action_type = action.get("action") if action else None
+
+            # Debug log: print the current action
+            if action_type:
+                print(f"[VideoLearning] Current action: {action_type}")
+
+            # Check for a swipe action (case-insensitive)
+            if action and action_type and action_type.lower() == "swipe":
+                # The VLM performed a swipe; record a new video
+                print(f"[VideoLearning] Detected swipe action, recording new video...")
+                self._record_video_from_screenshot(screenshot)
+                print(f"[VideoLearning] ✓ Recorded video {self.video_counter}/{self.current_session.target_count}")
+                return self._check_target_reached()
+
+            # If it was not a swipe action, print a hint
+            if action_type and action_type.lower() != "swipe":
+                print(f"[VideoLearning] Non-swipe action detected ({action_type}), waiting for swipe...")
 
         except Exception as e:
             print(f"[VideoLearning] Warning: {e}")
             import traceback
             traceback.print_exc()
 
         return None
 
+    def _check_target_reached(self) -> Optional[str]:
+        """Check whether the target video count has been reached."""
+        if self.video_counter >= self.current_session.target_count:
+            print(f"[VideoLearning] ✓ Target reached! Stopping...")
+            self.current_session.is_active = False
+            self._save_session()
+            return "stop"
+        return None
+
     def _record_video_from_screenshot(self, screenshot):
         """Helper method to record video from screenshot with analysis."""
         import base64
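The helper bodies are cut off at this point in the diff. A hypothetical sketch of what _record_video_from_screenshot might do, built only from fields referenced elsewhere in this diff (screenshot_dir, video_counter, _analyzer, enable_analysis) and not from the actual commit, could look like:

    # Hypothetical sketch, not the committed implementation.
    def _record_video_from_screenshot(self, screenshot):
        import base64
        self.video_counter += 1
        image_path = self.screenshot_dir / f"video_{self.video_counter:04d}.png"
        image_path.write_bytes(base64.b64decode(screenshot.base64_data))
        if self.enable_analysis and self._analyzer is not None:
            # Run the ScreenshotAnalyzer on the saved frame; the exact call is not shown in this diff.
            pass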