Improve Video Learning Agent with action-based detection and analysis toggle
- Change video detection from screenshot hash to action-based (Swipe detection)
- Add enable_analysis toggle to disable VLM screenshot analysis
- Improve task prompt to prevent VLM from stopping prematurely
- Add debug logging for action detection troubleshooting
- Fix ModelResponse attribute error (content -> raw_content)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -26,6 +26,7 @@ class SessionCreateRequest(BaseModel):
|
||||
target_count: int = Field(10, description="Number of videos to watch", ge=1, le=100)
|
||||
category: Optional[str] = Field(None, description="Target category filter")
|
||||
watch_duration: float = Field(3.0, description="Watch duration per video (seconds)", ge=1.0, le=30.0)
|
||||
enable_analysis: bool = Field(True, description="Enable VLM screenshot analysis")
|
||||
|
||||
|
||||
class SessionControlRequest(BaseModel):
|
||||
@@ -100,6 +101,7 @@ async def create_session(
|
||||
model_config=model_config,
|
||||
platform=request.platform,
|
||||
output_dir=config.VIDEO_LEARNING_OUTPUT_DIR,
|
||||
enable_analysis=request.enable_analysis,
|
||||
)
|
||||
|
||||
# Setup callbacks for real-time updates
|
||||
@@ -180,45 +182,53 @@ async def start_session(session_id: str) -> Dict[str, str]:
|
||||
app_name = info["name"]
|
||||
|
||||
# Build clear task instructions
|
||||
# 重要:不让 VLM 自己计数,而是持续执行直到被外部停止
|
||||
# 这样可以确保 VideoLearningAgent 的计数与实际观看数一致
|
||||
if category:
|
||||
task = f"""你是一个视频学习助手。请严格按照以下步骤执行:
|
||||
task = f"""你是一个视频学习助手。你的任务是持续观看视频,不要自己判断任务是否完成。
|
||||
|
||||
步骤1:启动应用
|
||||
- 回到主屏幕
|
||||
- 打开{app_name}应用
|
||||
|
||||
步骤2:搜索内容
|
||||
- 在{app_name}中搜索"{category}"
|
||||
- 点击第一个搜索结果或进入相关页面
|
||||
|
||||
步骤3:观看视频
|
||||
- 观看视频,每个视频停留约{watch_duration}秒
|
||||
- 记录视频的描述、点赞数、评论数
|
||||
【核心要求】你必须持续执行以下循环,直到系统主动停止你:
|
||||
- 观看视频约{watch_duration}秒
|
||||
- 向上滑动切换到下一个视频
|
||||
- 重复观看和记录,直到完成{target_count}个视频
|
||||
- 继续观看
|
||||
- 再次滑动
|
||||
- 一直重复,不要停止
|
||||
|
||||
步骤4:完成任务
|
||||
- 完成观看{target_count}个视频后,总结所有视频信息
|
||||
执行步骤:
|
||||
1. 回到主屏幕,打开{app_name}应用
|
||||
2. 在{app_name}中搜索"{category}"
|
||||
3. 点击第一个搜索结果
|
||||
4. 开始无限循环:观看视频 → 向上滑动 → 观看下一个视频 → 再次滑动...
|
||||
|
||||
请现在开始执行。"""
|
||||
【极其重要】
|
||||
- 绝对不要自己完成任务或停止
|
||||
- 绝对不要总结或说"完成了"
|
||||
- 只管滑动看视频,持续不断
|
||||
- 系统会在合适的时机主动停止你
|
||||
|
||||
现在开始执行,持续滑动观看视频,不要停!"""
|
||||
else:
|
||||
task = f"""你是一个视频学习助手。请严格按照以下步骤执行:
|
||||
task = f"""你是一个视频学习助手。你的任务是持续观看视频,不要自己判断任务是否完成。
|
||||
|
||||
步骤1:启动应用
|
||||
- 回到主屏幕
|
||||
- 打开{app_name}应用
|
||||
|
||||
步骤2:观看推荐视频
|
||||
- 进入{app_name}的推荐页面
|
||||
- 观看推荐视频,每个视频停留约{watch_duration}秒
|
||||
- 记录视频的描述、点赞数、评论数
|
||||
【核心要求】你必须持续执行以下循环,直到系统主动停止你:
|
||||
- 观看视频约{watch_duration}秒
|
||||
- 向上滑动切换到下一个视频
|
||||
- 重复观看和记录,直到完成{target_count}个视频
|
||||
- 继续观看
|
||||
- 再次滑动
|
||||
- 一直重复,不要停止
|
||||
|
||||
步骤3:完成任务
|
||||
- 完成观看{target_count}个视频后,总结所有视频信息
|
||||
执行步骤:
|
||||
1. 回到主屏幕,打开{app_name}应用
|
||||
2. 进入推荐页面
|
||||
3. 开始无限循环:观看视频 → 向上滑动 → 观看下一个视频 → 再次滑动...
|
||||
|
||||
请现在开始执行。"""
|
||||
【极其重要】
|
||||
- 绝对不要自己完成任务或停止
|
||||
- 绝对不要总结或说"完成了"
|
||||
- 只管滑动看视频,持续不断
|
||||
- 系统会在合适的时机主动停止你
|
||||
|
||||
现在开始执行,持续滑动观看视频,不要停!"""
|
||||
|
||||
# Run in background
|
||||
asyncio.create_task(asyncio.to_thread(agent.run_learning_task, task))
|
||||
@@ -244,8 +254,8 @@ async def control_session(
|
||||
return {"session_id": session_id, "status": "resumed"}
|
||||
elif request.action == "stop":
|
||||
agent.stop_session()
|
||||
# Remove from active sessions
|
||||
del _active_sessions[session_id]
|
||||
# Don't delete immediately - let status queries still work
|
||||
# Session will be cleaned up when is_active becomes False
|
||||
return {"session_id": session_id, "status": "stopped"}
|
||||
else:
|
||||
raise HTTPException(status_code=400, detail=f"Invalid action: {request.action}")
|
||||
|
||||
@@ -19,6 +19,7 @@ const VideoLearningModule = {
|
||||
targetCount = 10,
|
||||
category = null,
|
||||
watchDuration = 3.0,
|
||||
enableAnalysis = true,
|
||||
} = options;
|
||||
|
||||
try {
|
||||
@@ -28,6 +29,7 @@ const VideoLearningModule = {
|
||||
target_count: targetCount,
|
||||
category: category,
|
||||
watch_duration: watchDuration,
|
||||
enable_analysis: enableAnalysis,
|
||||
});
|
||||
|
||||
this.currentSessionId = response.data.session_id;
|
||||
|
||||
@@ -97,6 +97,14 @@
|
||||
<small>Leave empty to watch recommended videos</small>
|
||||
</div>
|
||||
|
||||
<div class="form-group checkbox-group">
|
||||
<label>
|
||||
<input type="checkbox" v-model="config.enableAnalysis" :disabled="loading">
|
||||
<span>Enable Screenshot Analysis</span>
|
||||
</label>
|
||||
<small>Analyze video content using VLM to extract description, likes, comments, tags, etc.</small>
|
||||
</div>
|
||||
|
||||
<button @click="createAndStartSession" class="btn btn-primary" :disabled="loading || !config.deviceId">
|
||||
<svg v-if="loading" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" class="spinning">
|
||||
<path d="M21 12a9 9 0 1 1-6.219-8.56"></path>
|
||||
@@ -269,6 +277,7 @@
|
||||
targetCount: 10,
|
||||
category: '',
|
||||
watchDuration: 3.0,
|
||||
enableAnalysis: true,
|
||||
},
|
||||
};
|
||||
},
|
||||
@@ -304,6 +313,7 @@
|
||||
targetCount: this.config.targetCount,
|
||||
category: this.config.category || null,
|
||||
watchDuration: this.config.watchDuration,
|
||||
enableAnalysis: this.config.enableAnalysis,
|
||||
}
|
||||
);
|
||||
|
||||
@@ -354,7 +364,12 @@
|
||||
|
||||
try {
|
||||
await VideoLearningModule.controlSession(this.currentSessionId, 'stop');
|
||||
await this.updateSessionStatus();
|
||||
// Stop polling first
|
||||
VideoLearningModule.stopPolling();
|
||||
// Mark session as stopped in UI
|
||||
if (this.sessionStatus) {
|
||||
this.sessionStatus.is_active = false;
|
||||
}
|
||||
this.showToast('Session stopped', 'info');
|
||||
} catch (error) {
|
||||
this.showToast('Failed to stop session', 'error');
|
||||
|
||||
Reference in New Issue
Block a user