Improve Video Learning Agent with action-based detection and analysis toggle
- Change video detection from screenshot hash to action-based (Swipe detection)
- Add enable_analysis toggle to disable VLM screenshot analysis
- Improve task prompt to prevent VLM from stopping prematurely
- Add debug logging for action detection troubleshooting
- Fix ModelResponse attribute error (content -> raw_content)

Co-Authored-By: Claude <noreply@anthropic.com>
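The core idea of the change is that a new video is counted only when the agent actually performs a swipe, rather than whenever the screenshot hash changes. Below is a minimal sketch of that rule, pulled out of the step callback for illustration only; it assumes a step result whose `action` is a dict with an "action" key (e.g. `{"action": "Swipe"}`), which is how the diff reads it, and the helper name `is_new_video` is hypothetical.

```python
from typing import Optional

def is_new_video(action: Optional[dict]) -> bool:
    """Count a new video only when the agent actually performed a swipe."""
    action_type = action.get("action") if action else None
    return bool(action_type) and action_type.lower() == "swipe"

# Example: a Click does not bump the counter, a Swipe does.
assert not is_new_video({"action": "Click"})
assert is_new_video({"action": "Swipe"})
```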
@@ -26,6 +26,7 @@ class SessionCreateRequest(BaseModel):
target_count: int = Field(10, description="Number of videos to watch", ge=1, le=100)
category: Optional[str] = Field(None, description="Target category filter")
watch_duration: float = Field(3.0, description="Watch duration per video (seconds)", ge=1.0, le=30.0)
enable_analysis: bool = Field(True, description="Enable VLM screenshot analysis")


class SessionControlRequest(BaseModel):
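For reference, a hedged sketch of the payload the updated SessionCreateRequest accepts; only the fields visible in this hunk are shown, and the real model may define additional fields (device, platform, etc.) elsewhere.

```python
from typing import Optional
from pydantic import BaseModel, Field

class SessionCreateRequest(BaseModel):
    target_count: int = Field(10, description="Number of videos to watch", ge=1, le=100)
    category: Optional[str] = Field(None, description="Target category filter")
    watch_duration: float = Field(3.0, description="Watch duration per video (seconds)", ge=1.0, le=30.0)
    enable_analysis: bool = Field(True, description="Enable VLM screenshot analysis")

# Disable screenshot analysis for a faster, metadata-free run.
req = SessionCreateRequest(target_count=5, category="cooking", enable_analysis=False)
print(req.enable_analysis, req.watch_duration)  # False 3.0
```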
@@ -100,6 +101,7 @@ async def create_session(
model_config=model_config,
platform=request.platform,
output_dir=config.VIDEO_LEARNING_OUTPUT_DIR,
enable_analysis=request.enable_analysis,
)

# Setup callbacks for real-time updates
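A rough sketch of what that callback wiring might look like, based on the callback attributes declared later in this commit (`on_progress_update: Callable[[int, int], None]`, `on_session_complete: Callable[[LearningSession], None]`). The function bodies are illustrative, not the service's actual handlers.

```python
def setup_callbacks(agent) -> None:
    """Wire real-time update callbacks onto a VideoLearningAgent instance."""
    def _on_progress(watched: int, target: int) -> None:
        print(f"[session] watched {watched}/{target} videos")

    def _on_complete(session) -> None:
        print(f"[session] finished, target was {session.target_count}")

    agent.on_progress_update = _on_progress
    agent.on_session_complete = _on_complete
```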
@@ -180,45 +182,53 @@ async def start_session(session_id: str) -> Dict[str, str]:
app_name = info["name"]

# Build clear task instructions
# Important: do not let the VLM count videos itself; it keeps executing until it is stopped externally.
# This keeps the VideoLearningAgent's count consistent with the number of videos actually watched.
if category:
task = f"""You are a video learning assistant. Follow these steps exactly:
task = f"""You are a video learning assistant. Your job is to keep watching videos; do not decide on your own that the task is finished.

Step 1: Launch the app
- Return to the home screen
- Open the {app_name} app

Step 2: Search for content
- Search for "{category}" in {app_name}
- Tap the first search result or open a related page

Step 3: Watch videos
- Watch each video for about {watch_duration} seconds
- Record each video's description, like count, and comment count
[Core requirement] You must keep running the following loop until the system stops you:
- Watch the video for about {watch_duration} seconds
- Swipe up to move to the next video
- Repeat watching and recording until {target_count} videos are done
- Keep watching
- Swipe again
- Repeat forever; do not stop

Step 4: Finish the task
- After watching {target_count} videos, summarize the information for all of them
Steps to execute:
1. Return to the home screen and open the {app_name} app
2. Search for "{category}" in {app_name}
3. Tap the first search result
4. Start an endless loop: watch a video → swipe up → watch the next video → swipe again...

Start now."""
[Extremely important]
- Never finish the task or stop on your own
- Never summarize or say "done"
- Just keep swiping and watching, without interruption
- The system will stop you at the right time

Start now: keep swiping and watching videos, and do not stop!"""
else:
task = f"""You are a video learning assistant. Follow these steps exactly:
task = f"""You are a video learning assistant. Your job is to keep watching videos; do not decide on your own that the task is finished.

Step 1: Launch the app
- Return to the home screen
- Open the {app_name} app

Step 2: Watch recommended videos
- Go to the {app_name} recommendations feed
- Watch the recommended videos, staying about {watch_duration} seconds on each
- Record each video's description, like count, and comment count
[Core requirement] You must keep running the following loop until the system stops you:
- Watch the video for about {watch_duration} seconds
- Swipe up to move to the next video
- Repeat watching and recording until {target_count} videos are done
- Keep watching
- Swipe again
- Repeat forever; do not stop

Step 3: Finish the task
- After watching {target_count} videos, summarize the information for all of them
Steps to execute:
1. Return to the home screen and open the {app_name} app
2. Go to the recommendations feed
3. Start an endless loop: watch a video → swipe up → watch the next video → swipe again...

Start now."""
[Extremely important]
- Never finish the task or stop on your own
- Never summarize or say "done"
- Just keep swiping and watching, without interruption
- The system will stop you at the right time

Start now: keep swiping and watching videos, and do not stop!"""

# Run in background
asyncio.create_task(asyncio.to_thread(agent.run_learning_task, task))
@@ -244,8 +254,8 @@ async def control_session(
return {"session_id": session_id, "status": "resumed"}
elif request.action == "stop":
agent.stop_session()
# Remove from active sessions
del _active_sessions[session_id]
# Don't delete immediately - let status queries still work
# Session will be cleaned up when is_active becomes False
return {"session_id": session_id, "status": "stopped"}
else:
raise HTTPException(status_code=400, detail=f"Invalid action: {request.action}")
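A hedged sketch of the deferred-cleanup idea introduced here: instead of deleting the session entry on "stop", keep it so status polls still resolve, then prune it once is_active goes False. The helper name and loop are illustrative; only `_active_sessions` and `current_session.is_active` come from the diff.

```python
def cleanup_inactive(active_sessions: dict) -> None:
    """Drop session entries whose agents have finished (is_active is False)."""
    for session_id in list(active_sessions):
        agent = active_sessions[session_id]
        if agent.current_session and not agent.current_session.is_active:
            del active_sessions[session_id]
```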
@@ -19,6 +19,7 @@ const VideoLearningModule = {
targetCount = 10,
category = null,
watchDuration = 3.0,
enableAnalysis = true,
} = options;

try {
@@ -28,6 +29,7 @@ const VideoLearningModule = {
target_count: targetCount,
category: category,
watch_duration: watchDuration,
enable_analysis: enableAnalysis,
});

this.currentSessionId = response.data.session_id;
@@ -97,6 +97,14 @@
<small>Leave empty to watch recommended videos</small>
</div>

<div class="form-group checkbox-group">
<label>
<input type="checkbox" v-model="config.enableAnalysis" :disabled="loading">
<span>Enable Screenshot Analysis</span>
</label>
<small>Analyze video content using VLM to extract description, likes, comments, tags, etc.</small>
</div>

<button @click="createAndStartSession" class="btn btn-primary" :disabled="loading || !config.deviceId">
<svg v-if="loading" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" class="spinning">
<path d="M21 12a9 9 0 1 1-6.219-8.56"></path>
@@ -269,6 +277,7 @@
targetCount: 10,
category: '',
watchDuration: 3.0,
enableAnalysis: true,
},
};
},
@@ -304,6 +313,7 @@
targetCount: this.config.targetCount,
category: this.config.category || null,
watchDuration: this.config.watchDuration,
enableAnalysis: this.config.enableAnalysis,
}
);
@@ -354,7 +364,12 @@
try {
await VideoLearningModule.controlSession(this.currentSessionId, 'stop');
await this.updateSessionStatus();
// Stop polling first
VideoLearningModule.stopPolling();
// Mark session as stopped in UI
if (this.sessionStatus) {
this.sessionStatus.is_active = false;
}
this.showToast('Session stopped', 'info');
} catch (error) {
this.showToast('Failed to stop session', 'error');
@@ -132,7 +132,8 @@ class ScreenshotAnalyzer:
try:
# Call the VLM
response = self.model_client.request(messages)
result_text = response.content.strip()
# ModelResponse uses raw_content, not content
result_text = response.raw_content.strip()

# Parse the JSON
return self._parse_result(result_text)
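A defensive variant of the fix above, sketched for illustration only: prefer the raw_content attribute that ModelResponse exposes (per this commit), but fall back to a content attribute if an older response type still provides one. The helper name is hypothetical.

```python
def response_text(response) -> str:
    """Return the VLM response text, tolerating either attribute name."""
    text = getattr(response, "raw_content", None) or getattr(response, "content", "")
    return text.strip()
```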
@@ -209,6 +210,7 @@ class VideoLearningAgent:
model_config: ModelConfig,
platform: str = "douyin",
output_dir: str = "./video_learning_data",
enable_analysis: bool = True,
):
"""
Initialize Video Learning Agent.
@@ -217,11 +219,13 @@ class VideoLearningAgent:
model_config: Model configuration for VLM
platform: Platform name (douyin, kuaishou, tiktok)
output_dir: Directory to save screenshots and data
enable_analysis: Whether to enable VLM screenshot analysis
"""
self.model_config = model_config
self.platform = platform
self.output_dir = Path(output_dir)
self.output_dir.mkdir(parents=True, exist_ok=True)
self.enable_analysis = enable_analysis # screenshot analysis toggle

# Create screenshots subdirectory
self.screenshot_dir = self.output_dir / "screenshots"
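A hedged usage sketch for the new toggle; the argument names mirror the constructor in this hunk, and `model_config` is assumed to be a ModelConfig instance built elsewhere in the project.

```python
agent = VideoLearningAgent(
    model_config=model_config,
    platform="douyin",
    output_dir="./video_learning_data",
    enable_analysis=False,  # skip VLM screenshot analysis; no analyzer is created
)
```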
@@ -239,20 +243,23 @@ class VideoLearningAgent:
self.on_session_complete: Optional[Callable[[LearningSession], None]] = None
self.on_progress_update: Optional[Callable[[int, int], None]] = None

# Video detection: track screenshot changes (simplified)
self._last_screenshot_hash: Optional[str] = None
# Video detection: action-based
self._first_video_recorded: bool = False # whether the first video has been recorded

# Skip app startup screens
self._in_app_steps: int = 0
self._warmup_steps: int = 3 # Skip first 3 steps after entering app

# Screenshot analyzer for content extraction
# Screenshot analyzer for content extraction (only if enabled)
self._analyzer: Optional[ScreenshotAnalyzer] = None
try:
self._analyzer = ScreenshotAnalyzer(model_config)
print("[VideoLearning] Screenshot analyzer initialized")
except Exception as e:
print(f"[VideoLearning] Analyzer init failed: {e}")
if self.enable_analysis:
try:
self._analyzer = ScreenshotAnalyzer(model_config)
print("[VideoLearning] Screenshot analyzer initialized")
except Exception as e:
print(f"[VideoLearning] Analyzer init failed: {e}")
else:
print("[VideoLearning] Screenshot analysis disabled")

def start_session(
self,
@@ -360,10 +367,11 @@ class VideoLearningAgent:
"""
Callback after each step.

Simplified logic:
1. Check if we're in the target app using get_current_app()
2. Detect screenshot changes
3. Record video when screenshot changes
Action-based detection logic:
1. Check whether we are in the target app
2. Skip the warmup phase
3. Record the first video once warmup ends
4. Detect swipe actions and record a new video after each swipe

Args:
result: Step execution result
@@ -409,8 +417,9 @@ class VideoLearningAgent:
is_in_target = any(pkg.lower() in current_app.lower() for pkg in packages)

if not is_in_target:
# Reset warmup counter when leaving app
# Reset counters when leaving app
self._in_app_steps = 0
self._first_video_recorded = False
print(f"[VideoLearning] Not in target app: {current_app}")
return None
@@ -420,44 +429,52 @@ class VideoLearningAgent:
print(f"[VideoLearning] Warmup step {self._in_app_steps}/{self._warmup_steps}, skipping...")
return None

# Get screenshot
# Take a screenshot for the record
screenshot = get_device_factory().get_screenshot(self._device_id)

# Use full base64 data for hash (more sensitive)
current_hash = hashlib.md5(screenshot.base64_data.encode()).hexdigest()
# First video: record it as soon as warmup ends
if not self._first_video_recorded:
self._first_video_recorded = True
self._record_video_from_screenshot(screenshot)
print(f"[VideoLearning] ✓ Recorded first video {self.video_counter}/{self.current_session.target_count}")
return self._check_target_reached()

# Detect screenshot change and record video
if self._last_screenshot_hash is None:
# First screenshot in target app - record first video
self._last_screenshot_hash = current_hash
# Action-based detection: look for swipe actions
action = result.action
action_type = action.get("action") if action else None

# Debug log: print the current action
if action_type:
print(f"[VideoLearning] Current action: {action_type}")

# Check for a swipe action (case-insensitive)
if action and action_type and action_type.lower() == "swipe":
# The VLM performed a swipe; record a new video
print(f"[VideoLearning] Detected swipe action, recording new video...")
self._record_video_from_screenshot(screenshot)
print(f"[VideoLearning] ✓ Recorded video {self.video_counter}/{self.current_session.target_count}")
return self._check_target_reached()

# Check if we've reached target after recording
if self.video_counter >= self.current_session.target_count:
print(f"[VideoLearning] ✓ Target reached! Stopping...")
self.current_session.is_active = False
self._save_session()
return "stop"

elif current_hash != self._last_screenshot_hash:
# Screenshot changed - record new video
self._last_screenshot_hash = current_hash
self._record_video_from_screenshot(screenshot)
print(f"[VideoLearning] ✓ Recorded video {self.video_counter}/{self.current_session.target_count}")

# Check if we've reached target after recording
if self.video_counter >= self.current_session.target_count:
print(f"[VideoLearning] ✓ Target reached! Stopping...")
self.current_session.is_active = False
self._save_session()
return "stop"
# If the action is not a swipe, log a hint
if action_type and action_type.lower() != "swipe":
print(f"[VideoLearning] Non-swipe action detected ({action_type}), waiting for swipe...")

except Exception as e:
print(f"[VideoLearning] Warning: {e}")
import traceback
traceback.print_exc()

return None
def _check_target_reached(self) -> Optional[str]:
"""Check whether the target video count has been reached."""
if self.video_counter >= self.current_session.target_count:
print(f"[VideoLearning] ✓ Target reached! Stopping...")
self.current_session.is_active = False
self._save_session()
return "stop"
return None

def _record_video_from_screenshot(self, screenshot):
"""Helper method to record video from screenshot with analysis."""
import base64