From b97d3f3a9f7ecd3854b543e8101435176b43ec95 Mon Sep 17 00:00:00 2001
From: "let5sne.win10" <let5sne.win10.pc@gmail.com>
Date: Sat, 10 Jan 2026 01:47:09 +0800
Subject: [PATCH] Improve Video Learning Agent with action-based detection and
 analysis toggle

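- Change video detection from screenshot hash to action-based (Swipe detection)
- Add enable_analysis toggle to disable VLM screenshot analysis
- Rewrite task prompt so the VLM keeps swiping until stopped externally
  instead of counting videos itself (prevents it from stopping prematurely)
- Add debug logging for action detection troubleshooting
- Fix ModelResponse attribute error (content -> raw_content)
- Keep stopped sessions in _active_sessions so status queries still work
- Stop status polling in the dashboard UI once a session is stopped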

Co-Authored-By: Claude <noreply@anthropic.com>
---
 dashboard/api/video_learning.py       | 72 +++++++++++---------
 dashboard/static/js/video-learning.js |  2 +
 dashboard/static/video-learning.html  | 17 ++++-
 phone_agent/video_learning.py         | 97 ++++++++++++++++-----------
 4 files changed, 116 insertions(+), 72 deletions(-)

diff --git a/dashboard/api/video_learning.py b/dashboard/api/video_learning.py
index 71e138c..f235bee 100644
--- a/dashboard/api/video_learning.py
+++ b/dashboard/api/video_learning.py
@@ -26,6 +26,7 @@ class SessionCreateRequest(BaseModel):
     target_count: int = Field(10, description="Number of videos to watch", ge=1, le=100)
     category: Optional[str] = Field(None, description="Target category filter")
     watch_duration: float = Field(3.0, description="Watch duration per video (seconds)", ge=1.0, le=30.0)
+    enable_analysis: bool = Field(True, description="Enable VLM screenshot analysis")
 
 
 class SessionControlRequest(BaseModel):
@@ -100,6 +101,7 @@ async def create_session(
         model_config=model_config,
         platform=request.platform,
         output_dir=config.VIDEO_LEARNING_OUTPUT_DIR,
+        enable_analysis=request.enable_analysis,
     )
 
     # Setup callbacks for real-time updates
@@ -180,45 +182,53 @@ async def start_session(session_id: str) -> Dict[str, str]:
     app_name = info["name"]
 
     # Build clear task instructions
+    # 重要：不让 VLM 自己计数，而是持续执行直到被外部停止
+    # 这样可以确保 VideoLearningAgent 的计数与实际观看数一致
     if category:
-        task = f"""你是一个视频学习助手。请严格按照以下步骤执行：
+        task = f"""你是一个视频学习助手。你的任务是持续观看视频，不要自己判断任务是否完成。
 
-步骤1：启动应用
-- 回到主屏幕
-- 打开{app_name}应用
-
-步骤2：搜索内容
-- 在{app_name}中搜索"{category}"
-- 点击第一个搜索结果或进入相关页面
-
-步骤3：观看视频
-- 观看视频，每个视频停留约{watch_duration}秒
-- 记录视频的描述、点赞数、评论数
+【核心要求】你必须持续执行以下循环，直到系统主动停止你：
+- 观看视频约{watch_duration}秒
 - 向上滑动切换到下一个视频
-- 重复观看和记录，直到完成{target_count}个视频
+- 继续观看
+- 再次滑动
+- 一直重复，不要停止
 
-步骤4：完成任务
-- 完成观看{target_count}个视频后，总结所有视频信息
+执行步骤：
+1. 回到主屏幕，打开{app_name}应用
+2. 在{app_name}中搜索"{category}"
+3. 点击第一个搜索结果
+4. 开始无限循环：观看视频 → 向上滑动 → 观看下一个视频 → 再次滑动...
 
-请现在开始执行。"""
+【极其重要】
+- 绝对不要自己完成任务或停止
+- 绝对不要总结或说"完成了"
+- 只管滑动看视频，持续不断
+- 系统会在合适的时机主动停止你
+
+现在开始执行，持续滑动观看视频，不要停！"""
     else:
-        task = f"""你是一个视频学习助手。请严格按照以下步骤执行：
+        task = f"""你是一个视频学习助手。你的任务是持续观看视频，不要自己判断任务是否完成。
 
-步骤1：启动应用
-- 回到主屏幕
-- 打开{app_name}应用
-
-步骤2：观看推荐视频
-- 进入{app_name}的推荐页面
-- 观看推荐视频，每个视频停留约{watch_duration}秒
-- 记录视频的描述、点赞数、评论数
+【核心要求】你必须持续执行以下循环，直到系统主动停止你：
+- 观看视频约{watch_duration}秒
 - 向上滑动切换到下一个视频
-- 重复观看和记录，直到完成{target_count}个视频
+- 继续观看
+- 再次滑动
+- 一直重复，不要停止
 
-步骤3：完成任务
-- 完成观看{target_count}个视频后，总结所有视频信息
+执行步骤：
+1. 回到主屏幕，打开{app_name}应用
+2. 进入推荐页面
+3. 开始无限循环：观看视频 → 向上滑动 → 观看下一个视频 → 再次滑动...
 
-请现在开始执行。"""
+【极其重要】
+- 绝对不要自己完成任务或停止
+- 绝对不要总结或说"完成了"
+- 只管滑动看视频，持续不断
+- 系统会在合适的时机主动停止你
+
+现在开始执行，持续滑动观看视频，不要停！"""
 
     # Run in background
     asyncio.create_task(asyncio.to_thread(agent.run_learning_task, task))
@@ -244,8 +254,8 @@ async def control_session(
         return {"session_id": session_id, "status": "resumed"}
     elif request.action == "stop":
         agent.stop_session()
-        # Remove from active sessions
-        del _active_sessions[session_id]
+        # Keep the entry after stop so status queries for this session still work.
+        # NOTE(review): no eviction from _active_sessions is visible in this patch - confirm cleanup happens elsewhere.
         return {"session_id": session_id, "status": "stopped"}
     else:
         raise HTTPException(status_code=400, detail=f"Invalid action: {request.action}")
diff --git a/dashboard/static/js/video-learning.js b/dashboard/static/js/video-learning.js
index 7c26799..88aeb51 100644
--- a/dashboard/static/js/video-learning.js
+++ b/dashboard/static/js/video-learning.js
@@ -19,6 +19,7 @@ const VideoLearningModule = {
             targetCount = 10,
             category = null,
             watchDuration = 3.0,
+            enableAnalysis = true,
         } = options;
 
         try {
@@ -28,6 +29,7 @@ const VideoLearningModule = {
                 target_count: targetCount,
                 category: category,
                 watch_duration: watchDuration,
+                enable_analysis: enableAnalysis,
             });
 
             this.currentSessionId = response.data.session_id;
diff --git a/dashboard/static/video-learning.html b/dashboard/static/video-learning.html
index d04215f..cbd19e3 100644
--- a/dashboard/static/video-learning.html
+++ b/dashboard/static/video-learning.html
@@ -97,6 +97,14 @@
                         <small>Leave empty to watch recommended videos</small>
                     </div>
 
+                    <div class="form-group checkbox-group">
+                        <label>
+                            <input type="checkbox" v-model="config.enableAnalysis" :disabled="loading">
+                            <span>Enable Screenshot Analysis</span>
+                        </label>
+                        <small>Analyze video content using VLM to extract description, likes, comments, tags, etc.</small>
+                    </div>
+
                     <button @click="createAndStartSession" class="btn btn-primary" :disabled="loading || !config.deviceId">
                         <svg v-if="loading" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" class="spinning">
                             <path d="M21 12a9 9 0 1 1-6.219-8.56"></path>
@@ -269,6 +277,7 @@
                         targetCount: 10,
                         category: '',
                         watchDuration: 3.0,
+                        enableAnalysis: true,
                     },
                 };
             },
@@ -304,6 +313,7 @@
                                 targetCount: this.config.targetCount,
                                 category: this.config.category || null,
                                 watchDuration: this.config.watchDuration,
+                                enableAnalysis: this.config.enableAnalysis,
                             }
                         );
 
@@ -354,7 +364,12 @@
 
                     try {
                         await VideoLearningModule.controlSession(this.currentSessionId, 'stop');
-                        await this.updateSessionStatus();
+                        // Stop polling first
+                        VideoLearningModule.stopPolling();
+                        // Mark session as stopped in UI
+                        if (this.sessionStatus) {
+                            this.sessionStatus.is_active = false;
+                        }
                         this.showToast('Session stopped', 'info');
                     } catch (error) {
                         this.showToast('Failed to stop session', 'error');
diff --git a/phone_agent/video_learning.py b/phone_agent/video_learning.py
index 18891f2..de67750 100644
--- a/phone_agent/video_learning.py
+++ b/phone_agent/video_learning.py
@@ -132,7 +132,8 @@ class ScreenshotAnalyzer:
         try:
             # 调用 VLM
             response = self.model_client.request(messages)
-            result_text = response.content.strip()
+            # ModelResponse 使用 raw_content 而不是 content
+            result_text = response.raw_content.strip()
 
             # 解析 JSON
             return self._parse_result(result_text)
@@ -209,6 +210,7 @@ class VideoLearningAgent:
         model_config: ModelConfig,
         platform: str = "douyin",
         output_dir: str = "./video_learning_data",
+        enable_analysis: bool = True,
     ):
         """
         Initialize Video Learning Agent.
@@ -217,11 +219,13 @@ class VideoLearningAgent:
             model_config: Model configuration for VLM
             platform: Platform name (douyin, kuaishou, tiktok)
             output_dir: Directory to save screenshots and data
+            enable_analysis: Whether to enable VLM screenshot analysis
         """
         self.model_config = model_config
         self.platform = platform
         self.output_dir = Path(output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)
+        self.enable_analysis = enable_analysis  # 画面分析开关
 
         # Create screenshots subdirectory
         self.screenshot_dir = self.output_dir / "screenshots"
@@ -239,20 +243,23 @@ class VideoLearningAgent:
         self.on_session_complete: Optional[Callable[[LearningSession], None]] = None
         self.on_progress_update: Optional[Callable[[int, int], None]] = None
 
-        # Video detection: track screenshot changes (simplified)
-        self._last_screenshot_hash: Optional[str] = None
+        # Video detection: 基于动作检测
+        self._first_video_recorded: bool = False  # 是否已记录首视频
 
         # Skip app startup screens
         self._in_app_steps: int = 0
         self._warmup_steps: int = 3  # Skip first 3 steps after entering app
 
-        # Screenshot analyzer for content extraction
+        # Screenshot analyzer for content extraction (only if enabled)
         self._analyzer: Optional[ScreenshotAnalyzer] = None
-        try:
-            self._analyzer = ScreenshotAnalyzer(model_config)
-            print("[VideoLearning] Screenshot analyzer initialized")
-        except Exception as e:
-            print(f"[VideoLearning] Analyzer init failed: {e}")
+        if self.enable_analysis:
+            try:
+                self._analyzer = ScreenshotAnalyzer(model_config)
+                print("[VideoLearning] Screenshot analyzer initialized")
+            except Exception as e:
+                print(f"[VideoLearning] Analyzer init failed: {e}")
+        else:
+            print("[VideoLearning] Screenshot analysis disabled")
 
     def start_session(
         self,
@@ -360,10 +367,11 @@ class VideoLearningAgent:
         """
         Callback after each step.
 
-        Simplified logic:
-        1. Check if we're in the target app using get_current_app()
-        2. Detect screenshot changes
-        3. Record video when screenshot changes
+        基于动作检测的逻辑：
+        1. 检测是否在目标 APP 中
+        2. Warmup 阶段跳过
+        3. Warmup 结束后记录首视频
+        4. 检测滑动动作，滑动后记录新视频
 
         Args:
             result: Step execution result
@@ -409,8 +417,9 @@ class VideoLearningAgent:
             is_in_target = any(pkg.lower() in current_app.lower() for pkg in packages)
 
             if not is_in_target:
-                # Reset warmup counter when leaving app
+                # Reset counters when leaving app
                 self._in_app_steps = 0
+                self._first_video_recorded = False
                 print(f"[VideoLearning] Not in target app: {current_app}")
                 return None
 
@@ -420,44 +429,52 @@ class VideoLearningAgent:
                 print(f"[VideoLearning] Warmup step {self._in_app_steps}/{self._warmup_steps}, skipping...")
                 return None
 
-            # Get screenshot
+            # 获取截图用于记录
             screenshot = get_device_factory().get_screenshot(self._device_id)
 
-            # Use full base64 data for hash (more sensitive)
-            current_hash = hashlib.md5(screenshot.base64_data.encode()).hexdigest()
+            # 首视频记录：warmup 结束后立即记录第一个视频
+            if not self._first_video_recorded:
+                self._first_video_recorded = True
+                self._record_video_from_screenshot(screenshot)
+                print(f"[VideoLearning] ✓ Recorded first video {self.video_counter}/{self.current_session.target_count}")
+                return self._check_target_reached()
 
-            # Detect screenshot change and record video
-            if self._last_screenshot_hash is None:
-                # First screenshot in target app - record first video
-                self._last_screenshot_hash = current_hash
+            # 基于动作检测：检测滑动动作
+            action = result.action
+            action_type = action.get("action") if action else None
+
+            # 调试日志：打印当前动作
+            if action_type:
+                print(f"[VideoLearning] Current action: {action_type}")
+
+            # 检查滑动动作（忽略大小写）
+            if action and action_type and action_type.lower() == "swipe":
+                # VLM 执行了滑动，记录新视频
+                print(f"[VideoLearning] Detected swipe action, recording new video...")
                 self._record_video_from_screenshot(screenshot)
                 print(f"[VideoLearning] ✓ Recorded video {self.video_counter}/{self.current_session.target_count}")
+                return self._check_target_reached()
 
-                # Check if we've reached target after recording
-                if self.video_counter >= self.current_session.target_count:
-                    print(f"[VideoLearning] ✓ Target reached! Stopping...")
-                    self.current_session.is_active = False
-                    self._save_session()
-                    return "stop"
-
-            elif current_hash != self._last_screenshot_hash:
-                # Screenshot changed - record new video
-                self._last_screenshot_hash = current_hash
-                self._record_video_from_screenshot(screenshot)
-                print(f"[VideoLearning] ✓ Recorded video {self.video_counter}/{self.current_session.target_count}")
-
-                # Check if we've reached target after recording
-                if self.video_counter >= self.current_session.target_count:
-                    print(f"[VideoLearning] ✓ Target reached! Stopping...")
-                    self.current_session.is_active = False
-                    self._save_session()
-                    return "stop"
+            # 如果不是滑动动作，打印提示
+            if action_type and action_type.lower() != "swipe":
+                print(f"[VideoLearning] Non-swipe action detected ({action_type}), waiting for swipe...")
 
         except Exception as e:
             print(f"[VideoLearning] Warning: {e}")
+            import traceback
+            traceback.print_exc()
 
         return None
 
+    def _check_target_reached(self) -> Optional[str]:
+        """Return "stop" (after marking the session inactive and saving it) once video_counter reaches target_count, else None."""
+        if self.video_counter >= self.current_session.target_count:
+            print(f"[VideoLearning] ✓ Target reached! Stopping...")
+            self.current_session.is_active = False
+            self._save_session()
+            return "stop"
+        return None
+
     def _record_video_from_screenshot(self, screenshot):
         """Helper method to record video from screenshot with analysis."""
         import base64