Improve Video Learning Agent with action-based detection and analysis toggle
- Change video detection from screenshot hash to action-based (Swipe detection)
- Add enable_analysis toggle to disable VLM screenshot analysis
- Improve task prompt to prevent VLM from stopping prematurely
- Add debug logging for action detection troubleshooting
- Fix ModelResponse attribute error (content -> raw_content)

Co-Authored-By: Claude <noreply@anthropic.com>
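For context, a minimal usage sketch of the new enable_analysis toggle, assuming the VideoLearningAgent constructor shown in the diff below; the model_config value is a placeholder and is not part of this commit:

    # Sketch: disable VLM screenshot analysis so video detection relies purely on swipe actions.
    agent = VideoLearningAgent(
        model_config=model_config,           # a ModelConfig instance for the VLM (placeholder)
        platform="douyin",                   # or "kuaishou" / "tiktok"
        output_dir="./video_learning_data",
        enable_analysis=False,               # new toggle: skip ScreenshotAnalyzer initialization
    )

A browsing session would then be driven through start_session (declared later in the diff); its arguments are not shown in this hunk.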
@@ -132,7 +132,8 @@ class ScreenshotAnalyzer:
         try:
             # Call the VLM
             response = self.model_client.request(messages)
-            result_text = response.content.strip()
+            # ModelResponse uses raw_content rather than content
+            result_text = response.raw_content.strip()
 
             # Parse the JSON result
             return self._parse_result(result_text)
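Where different ModelResponse versions might expose either attribute, a defensive fallback is one option; this is only a sketch under that assumption, not what the commit does:

    # Sketch: prefer raw_content, fall back to content if a ModelResponse variant still exposes it.
    raw_text = getattr(response, "raw_content", None) or getattr(response, "content", "")
    result_text = raw_text.strip()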
@@ -209,6 +210,7 @@ class VideoLearningAgent:
         model_config: ModelConfig,
         platform: str = "douyin",
         output_dir: str = "./video_learning_data",
+        enable_analysis: bool = True,
     ):
         """
         Initialize Video Learning Agent.
@@ -217,11 +219,13 @@ class VideoLearningAgent:
             model_config: Model configuration for VLM
             platform: Platform name (douyin, kuaishou, tiktok)
             output_dir: Directory to save screenshots and data
+            enable_analysis: Whether to enable VLM screenshot analysis
         """
         self.model_config = model_config
         self.platform = platform
         self.output_dir = Path(output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)
+        self.enable_analysis = enable_analysis  # screenshot-analysis toggle
 
         # Create screenshots subdirectory
         self.screenshot_dir = self.output_dir / "screenshots"
@@ -239,20 +243,23 @@ class VideoLearningAgent:
         self.on_session_complete: Optional[Callable[[LearningSession], None]] = None
         self.on_progress_update: Optional[Callable[[int, int], None]] = None
 
-        # Video detection: track screenshot changes (simplified)
-        self._last_screenshot_hash: Optional[str] = None
+        # Video detection: action-based
+        self._first_video_recorded: bool = False  # whether the first video has been recorded
 
         # Skip app startup screens
         self._in_app_steps: int = 0
         self._warmup_steps: int = 3  # Skip first 3 steps after entering app
 
-        # Screenshot analyzer for content extraction
+        # Screenshot analyzer for content extraction (only if enabled)
         self._analyzer: Optional[ScreenshotAnalyzer] = None
-        try:
-            self._analyzer = ScreenshotAnalyzer(model_config)
-            print("[VideoLearning] Screenshot analyzer initialized")
-        except Exception as e:
-            print(f"[VideoLearning] Analyzer init failed: {e}")
+        if self.enable_analysis:
+            try:
+                self._analyzer = ScreenshotAnalyzer(model_config)
+                print("[VideoLearning] Screenshot analyzer initialized")
+            except Exception as e:
+                print(f"[VideoLearning] Analyzer init failed: {e}")
+        else:
+            print("[VideoLearning] Screenshot analysis disabled")
 
     def start_session(
         self,
@@ -360,10 +367,11 @@ class VideoLearningAgent:
         """
         Callback after each step.
 
-        Simplified logic:
-        1. Check if we're in the target app using get_current_app()
-        2. Detect screenshot changes
-        3. Record video when screenshot changes
+        Action-based detection logic:
+        1. Check whether we are in the target app
+        2. Skip steps during the warmup phase
+        3. Record the first video once warmup ends
+        4. Detect swipe actions and record a new video after each swipe
 
         Args:
             result: Step execution result
@@ -409,8 +417,9 @@ class VideoLearningAgent:
             is_in_target = any(pkg.lower() in current_app.lower() for pkg in packages)
 
             if not is_in_target:
-                # Reset warmup counter when leaving app
+                # Reset counters when leaving app
                 self._in_app_steps = 0
+                self._first_video_recorded = False
                 print(f"[VideoLearning] Not in target app: {current_app}")
                 return None
 
@@ -420,44 +429,52 @@ class VideoLearningAgent:
                 print(f"[VideoLearning] Warmup step {self._in_app_steps}/{self._warmup_steps}, skipping...")
                 return None
 
-            # Get screenshot
+            # Take a screenshot for recording
             screenshot = get_device_factory().get_screenshot(self._device_id)
 
-            # Use full base64 data for hash (more sensitive)
-            current_hash = hashlib.md5(screenshot.base64_data.encode()).hexdigest()
-            # Detect screenshot change and record video
-            if self._last_screenshot_hash is None:
-                # First screenshot in target app - record first video
-                self._last_screenshot_hash = current_hash
-                self._record_video_from_screenshot(screenshot)
-                print(f"[VideoLearning] ✓ Recorded video {self.video_counter}/{self.current_session.target_count}")
-
-                # Check if we've reached target after recording
-                if self.video_counter >= self.current_session.target_count:
-                    print(f"[VideoLearning] ✓ Target reached! Stopping...")
-                    self.current_session.is_active = False
-                    self._save_session()
-                    return "stop"
-
-            elif current_hash != self._last_screenshot_hash:
-                # Screenshot changed - record new video
-                self._last_screenshot_hash = current_hash
-                self._record_video_from_screenshot(screenshot)
-                print(f"[VideoLearning] ✓ Recorded video {self.video_counter}/{self.current_session.target_count}")
-                # Check if we've reached target after recording
-                if self.video_counter >= self.current_session.target_count:
-                    print(f"[VideoLearning] ✓ Target reached! Stopping...")
-                    self.current_session.is_active = False
-                    self._save_session()
-                    return "stop"
+            # First video: record it as soon as warmup ends
+            if not self._first_video_recorded:
+                self._first_video_recorded = True
+                self._record_video_from_screenshot(screenshot)
+                print(f"[VideoLearning] ✓ Recorded first video {self.video_counter}/{self.current_session.target_count}")
+                return self._check_target_reached()
+
+            # Action-based detection: look for a swipe action
+            action = result.action
+            action_type = action.get("action") if action else None
+
+            # Debug log: print the current action
+            if action_type:
+                print(f"[VideoLearning] Current action: {action_type}")
+
+            # Check for a swipe action (case-insensitive)
+            if action and action_type and action_type.lower() == "swipe":
+                # The VLM performed a swipe; record a new video
+                print(f"[VideoLearning] Detected swipe action, recording new video...")
+                self._record_video_from_screenshot(screenshot)
+                print(f"[VideoLearning] ✓ Recorded video {self.video_counter}/{self.current_session.target_count}")
+                return self._check_target_reached()
+
+            # If it was not a swipe action, print a hint
+            if action_type and action_type.lower() != "swipe":
+                print(f"[VideoLearning] Non-swipe action detected ({action_type}), waiting for swipe...")
 
         except Exception as e:
             print(f"[VideoLearning] Warning: {e}")
             import traceback
             traceback.print_exc()
 
         return None
 
+    def _check_target_reached(self) -> Optional[str]:
+        """Check whether the target video count has been reached."""
+        if self.video_counter >= self.current_session.target_count:
+            print(f"[VideoLearning] ✓ Target reached! Stopping...")
+            self.current_session.is_active = False
+            self._save_session()
+            return "stop"
+        return None
+
     def _record_video_from_screenshot(self, screenshot):
         """Helper method to record video from screenshot with analysis."""
         import base64
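The helper bodies are cut off at this point in the diff. A hypothetical sketch of what _record_video_from_screenshot might do, built only from fields referenced elsewhere in this diff (screenshot_dir, video_counter, _analyzer, enable_analysis) and not from the actual commit, could look like:

    # Hypothetical sketch, not the committed implementation.
    def _record_video_from_screenshot(self, screenshot):
        import base64
        self.video_counter += 1
        image_path = self.screenshot_dir / f"video_{self.video_counter:04d}.png"
        image_path.write_bytes(base64.b64decode(screenshot.base64_data))
        if self.enable_analysis and self._analyzer is not None:
            # Run the ScreenshotAnalyzer on the saved frame; the exact call is not shown in this diff.
            pass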