Improve Video Learning Agent with action-based detection and analysis toggle

- Change video detection from screenshot hash to action-based (Swipe detection)
- Add enable_analysis toggle to disable VLM screenshot analysis
- Improve task prompt to prevent VLM from stopping prematurely
- Add debug logging for action detection troubleshooting
- Fix ModelResponse attribute error (content -> raw_content)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
let5sne.win10
2026-01-10 01:47:09 +08:00
parent 6b770832aa
commit b97d3f3a9f
4 changed files with 116 additions and 72 deletions

View File

@@ -26,6 +26,7 @@ class SessionCreateRequest(BaseModel):
target_count: int = Field(10, description="Number of videos to watch", ge=1, le=100)
category: Optional[str] = Field(None, description="Target category filter")
watch_duration: float = Field(3.0, description="Watch duration per video (seconds)", ge=1.0, le=30.0)
enable_analysis: bool = Field(True, description="Enable VLM screenshot analysis")
class SessionControlRequest(BaseModel):
@@ -100,6 +101,7 @@ async def create_session(
model_config=model_config,
platform=request.platform,
output_dir=config.VIDEO_LEARNING_OUTPUT_DIR,
enable_analysis=request.enable_analysis,
)
# Setup callbacks for real-time updates
@@ -180,45 +182,53 @@ async def start_session(session_id: str) -> Dict[str, str]:
app_name = info["name"]
# Build clear task instructions
# 重要:不让 VLM 自己计数,而是持续执行直到被外部停止
# 这样可以确保 VideoLearningAgent 的计数与实际观看数一致
if category:
task = f"""你是一个视频学习助手。请严格按照以下步骤执行:
task = f"""你是一个视频学习助手。你的任务是持续观看视频,不要自己判断任务是否完成。
步骤1启动应用
- 回到主屏幕
- 打开{app_name}应用
步骤2搜索内容
- 在{app_name}中搜索"{category}"
- 点击第一个搜索结果或进入相关页面
步骤3观看视频
- 观看视频,每个视频停留约{watch_duration}
- 记录视频的描述、点赞数、评论数
【核心要求】你必须持续执行以下循环,直到系统主动停止你:
- 观看视频约{watch_duration}
- 向上滑动切换到下一个视频
- 重复观看和记录,直到完成{target_count}个视频
- 继续观看
- 再次滑动
- 一直重复,不要停止
步骤4完成任务
- 完成观看{target_count}个视频后,总结所有视频信息
执行步骤:
1. 回到主屏幕,打开{app_name}应用
2. 在{app_name}中搜索"{category}"
3. 点击第一个搜索结果
4. 开始无限循环:观看视频 → 向上滑动 → 观看下一个视频 → 再次滑动...
请现在开始执行。"""
【极其重要】
- 绝对不要自己完成任务或停止
- 绝对不要总结或说"完成了"
- 只管滑动看视频,持续不断
- 系统会在合适的时机主动停止你
现在开始执行,持续滑动观看视频,不要停!"""
else:
task = f"""你是一个视频学习助手。请严格按照以下步骤执行:
task = f"""你是一个视频学习助手。你的任务是持续观看视频,不要自己判断任务是否完成。
步骤1启动应用
- 回到主屏幕
- 打开{app_name}应用
步骤2观看推荐视频
- 进入{app_name}的推荐页面
- 观看推荐视频,每个视频停留约{watch_duration}
- 记录视频的描述、点赞数、评论数
【核心要求】你必须持续执行以下循环,直到系统主动停止你:
- 观看视频约{watch_duration}
- 向上滑动切换到下一个视频
- 重复观看和记录,直到完成{target_count}个视频
- 继续观看
- 再次滑动
- 一直重复,不要停止
步骤3完成任务
- 完成观看{target_count}个视频后,总结所有视频信息
执行步骤:
1. 回到主屏幕,打开{app_name}应用
2. 进入推荐页面
3. 开始无限循环:观看视频 → 向上滑动 → 观看下一个视频 → 再次滑动...
请现在开始执行。"""
【极其重要】
- 绝对不要自己完成任务或停止
- 绝对不要总结或说"完成了"
- 只管滑动看视频,持续不断
- 系统会在合适的时机主动停止你
现在开始执行,持续滑动观看视频,不要停!"""
# Run in background
asyncio.create_task(asyncio.to_thread(agent.run_learning_task, task))
@@ -244,8 +254,8 @@ async def control_session(
return {"session_id": session_id, "status": "resumed"}
elif request.action == "stop":
agent.stop_session()
# Remove from active sessions
del _active_sessions[session_id]
# Don't delete immediately - let status queries still work
# Session will be cleaned up when is_active becomes False
return {"session_id": session_id, "status": "stopped"}
else:
raise HTTPException(status_code=400, detail=f"Invalid action: {request.action}")

View File

@@ -19,6 +19,7 @@ const VideoLearningModule = {
targetCount = 10,
category = null,
watchDuration = 3.0,
enableAnalysis = true,
} = options;
try {
@@ -28,6 +29,7 @@ const VideoLearningModule = {
target_count: targetCount,
category: category,
watch_duration: watchDuration,
enable_analysis: enableAnalysis,
});
this.currentSessionId = response.data.session_id;

View File

@@ -97,6 +97,14 @@
<small>Leave empty to watch recommended videos</small>
</div>
<div class="form-group checkbox-group">
<label>
<input type="checkbox" v-model="config.enableAnalysis" :disabled="loading">
<span>Enable Screenshot Analysis</span>
</label>
<small>Analyze video content using VLM to extract description, likes, comments, tags, etc.</small>
</div>
<button @click="createAndStartSession" class="btn btn-primary" :disabled="loading || !config.deviceId">
<svg v-if="loading" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" class="spinning">
<path d="M21 12a9 9 0 1 1-6.219-8.56"></path>
@@ -269,6 +277,7 @@
targetCount: 10,
category: '',
watchDuration: 3.0,
enableAnalysis: true,
},
};
},
@@ -304,6 +313,7 @@
targetCount: this.config.targetCount,
category: this.config.category || null,
watchDuration: this.config.watchDuration,
enableAnalysis: this.config.enableAnalysis,
}
);
@@ -354,7 +364,12 @@
try {
await VideoLearningModule.controlSession(this.currentSessionId, 'stop');
await this.updateSessionStatus();
// Stop polling first
VideoLearningModule.stopPolling();
// Mark session as stopped in UI
if (this.sessionStatus) {
this.sessionStatus.is_active = false;
}
this.showToast('Session stopped', 'info');
} catch (error) {
this.showToast('Failed to stop session', 'error');

View File

@@ -132,7 +132,8 @@ class ScreenshotAnalyzer:
try:
# 调用 VLM
response = self.model_client.request(messages)
result_text = response.content.strip()
# ModelResponse 使用 raw_content 而不是 content
result_text = response.raw_content.strip()
# 解析 JSON
return self._parse_result(result_text)
@@ -209,6 +210,7 @@ class VideoLearningAgent:
model_config: ModelConfig,
platform: str = "douyin",
output_dir: str = "./video_learning_data",
enable_analysis: bool = True,
):
"""
Initialize Video Learning Agent.
@@ -217,11 +219,13 @@ class VideoLearningAgent:
model_config: Model configuration for VLM
platform: Platform name (douyin, kuaishou, tiktok)
output_dir: Directory to save screenshots and data
enable_analysis: Whether to enable VLM screenshot analysis
"""
self.model_config = model_config
self.platform = platform
self.output_dir = Path(output_dir)
self.output_dir.mkdir(parents=True, exist_ok=True)
self.enable_analysis = enable_analysis # 画面分析开关
# Create screenshots subdirectory
self.screenshot_dir = self.output_dir / "screenshots"
@@ -239,20 +243,23 @@ class VideoLearningAgent:
self.on_session_complete: Optional[Callable[[LearningSession], None]] = None
self.on_progress_update: Optional[Callable[[int, int], None]] = None
# Video detection: track screenshot changes (simplified)
self._last_screenshot_hash: Optional[str] = None
# Video detection: 基于动作检测
self._first_video_recorded: bool = False # 是否已记录首视频
# Skip app startup screens
self._in_app_steps: int = 0
self._warmup_steps: int = 3 # Skip first 3 steps after entering app
# Screenshot analyzer for content extraction
# Screenshot analyzer for content extraction (only if enabled)
self._analyzer: Optional[ScreenshotAnalyzer] = None
try:
self._analyzer = ScreenshotAnalyzer(model_config)
print("[VideoLearning] Screenshot analyzer initialized")
except Exception as e:
print(f"[VideoLearning] Analyzer init failed: {e}")
if self.enable_analysis:
try:
self._analyzer = ScreenshotAnalyzer(model_config)
print("[VideoLearning] Screenshot analyzer initialized")
except Exception as e:
print(f"[VideoLearning] Analyzer init failed: {e}")
else:
print("[VideoLearning] Screenshot analysis disabled")
def start_session(
self,
@@ -360,10 +367,11 @@ class VideoLearningAgent:
"""
Callback after each step.
Simplified logic:
1. Check if we're in the target app using get_current_app()
2. Detect screenshot changes
3. Record video when screenshot changes
基于动作检测的逻辑:
1. 检测是否在目标 APP 中
2. Warmup 阶段跳过
3. Warmup 结束后记录首视频
4. 检测滑动动作,滑动后记录新视频
Args:
result: Step execution result
@@ -409,8 +417,9 @@ class VideoLearningAgent:
is_in_target = any(pkg.lower() in current_app.lower() for pkg in packages)
if not is_in_target:
# Reset warmup counter when leaving app
# Reset counters when leaving app
self._in_app_steps = 0
self._first_video_recorded = False
print(f"[VideoLearning] Not in target app: {current_app}")
return None
@@ -420,44 +429,52 @@ class VideoLearningAgent:
print(f"[VideoLearning] Warmup step {self._in_app_steps}/{self._warmup_steps}, skipping...")
return None
# Get screenshot
# 获取截图用于记录
screenshot = get_device_factory().get_screenshot(self._device_id)
# Use full base64 data for hash (more sensitive)
current_hash = hashlib.md5(screenshot.base64_data.encode()).hexdigest()
# 首视频记录warmup 结束后立即记录第一个视频
if not self._first_video_recorded:
self._first_video_recorded = True
self._record_video_from_screenshot(screenshot)
print(f"[VideoLearning] ✓ Recorded first video {self.video_counter}/{self.current_session.target_count}")
return self._check_target_reached()
# Detect screenshot change and record video
if self._last_screenshot_hash is None:
# First screenshot in target app - record first video
self._last_screenshot_hash = current_hash
# 基于动作检测:检测滑动动作
action = result.action
action_type = action.get("action") if action else None
# 调试日志:打印当前动作
if action_type:
print(f"[VideoLearning] Current action: {action_type}")
# 检查滑动动作(忽略大小写)
if action and action_type and action_type.lower() == "swipe":
# VLM 执行了滑动,记录新视频
print(f"[VideoLearning] Detected swipe action, recording new video...")
self._record_video_from_screenshot(screenshot)
print(f"[VideoLearning] ✓ Recorded video {self.video_counter}/{self.current_session.target_count}")
return self._check_target_reached()
# Check if we've reached target after recording
if self.video_counter >= self.current_session.target_count:
print(f"[VideoLearning] ✓ Target reached! Stopping...")
self.current_session.is_active = False
self._save_session()
return "stop"
elif current_hash != self._last_screenshot_hash:
# Screenshot changed - record new video
self._last_screenshot_hash = current_hash
self._record_video_from_screenshot(screenshot)
print(f"[VideoLearning] ✓ Recorded video {self.video_counter}/{self.current_session.target_count}")
# Check if we've reached target after recording
if self.video_counter >= self.current_session.target_count:
print(f"[VideoLearning] ✓ Target reached! Stopping...")
self.current_session.is_active = False
self._save_session()
return "stop"
# 如果不是滑动动作,打印提示
if action_type and action_type.lower() != "swipe":
print(f"[VideoLearning] Non-swipe action detected ({action_type}), waiting for swipe...")
except Exception as e:
print(f"[VideoLearning] Warning: {e}")
import traceback
traceback.print_exc()
return None
def _check_target_reached(self) -> Optional[str]:
    """Check whether the session's target video count has been reached.

    Compares the running ``video_counter`` against the current session's
    ``target_count``. When the target is met or exceeded, marks the session
    inactive, persists it via ``_save_session()``, and returns the agent
    control signal ``"stop"``; otherwise returns ``None`` so the watch
    loop continues.

    Returns:
        ``"stop"`` when the target has been reached, else ``None``.
    """
    if self.video_counter >= self.current_session.target_count:
        print(f"[VideoLearning] ✓ Target reached! Stopping...")
        # Flip the session off and persist before signalling the
        # outer agent loop to halt.
        self.current_session.is_active = False
        self._save_session()
        return "stop"
    return None
def _record_video_from_screenshot(self, screenshot):
"""Helper method to record video from screenshot with analysis."""
import base64