Add Video Learning Agent for short video platforms

Features:
- VideoLearningAgent for automated video watching on Douyin/Kuaishou/TikTok
- Web dashboard UI for video learning sessions
- Real-time progress tracking with screenshot capture
- App detection using get_current_app() for accurate recording
- Session management with pause/resume/stop controls

Technical improvements:
- Simplified video detection logic using direct app detection
- Full base64 hash for sensitive screenshot change detection
- Immediate stop when target video count is reached
- Fixed circular import issues with ModelConfig

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-09 22:54:57 +08:00
parent 3552df23d6
commit 5b3f214e20
15 changed files with 2317 additions and 1 deletions
--- a/.env.example
+++ b/.env.example
@@ -108,3 +108,16 @@ SCREENSHOT_THROTTLE_MS=500
 # Maximum task history to keep / 保留的最大任务历史数
 MAX_TASK_HISTORY=100
 # ============================================================================
 # Video Learning Configuration / 视频学习配置
 # ============================================================================
 # Output directory for video learning data / 视频学习数据输出目录
 VIDEO_LEARNING_OUTPUT_DIR=./video_learning_data
 # Model parameters for video learning / 视频学习模型参数
 PHONE_AGENT_MAX_TOKENS=3000
 PHONE_AGENT_TEMPERATURE=0.0
 PHONE_AGENT_TOP_P=0.85
 PHONE_AGENT_FREQUENCY_PENALTY=0.2
--- a/dashboard/api/init.py
+++ b/dashboard/api/init.py
@@ -5,9 +5,11 @@ API endpoints for the dashboard.
 from dashboard.api.devices import router as devices_router
 from dashboard.api.tasks import router as tasks_router
 from dashboard.api.websocket import router as websocket_router
 from dashboard.api.video_learning import router as video_learning_router
 __all__ = [
    "devices_router",
    "tasks_router",
    "websocket_router",
    "video_learning_router",
 ]
--- a/dashboard/api/video_learning.py
+++ b/dashboard/api/video_learning.py
@@ -0,0 +1,328 @@
 """
 Video Learning API endpoints for the dashboard.
 """
 import asyncio
 from datetime import datetime
 from typing import Dict, List, Optional
 from fastapi import APIRouter, Depends, HTTPException
 from pydantic import BaseModel, Field
 from dashboard.config import config
 from dashboard.dependencies import get_device_manager
 from dashboard.services.device_manager import DeviceManager
 from phone_agent import VideoLearningAgent
 from phone_agent.model.client import ModelConfig
 router = APIRouter(prefix="/api/video-learning", tags=["video-learning"])
 class SessionCreateRequest(BaseModel):
    """Payload for creating a video learning session."""
    # Device the session will run on (required).
    device_id: str = Field(default=..., description="Target device ID")
    # Short-video platform to use.
    platform: str = Field(
        default="douyin",
        description="Platform name (douyin, kuaishou, tiktok)",
    )
    # How many videos to watch; constrained to 1..100.
    target_count: int = Field(
        default=10,
        ge=1,
        le=100,
        description="Number of videos to watch",
    )
    # Optional category filter for the session.
    category: Optional[str] = Field(
        default=None,
        description="Target category filter",
    )
    # Seconds to stay on each video; constrained to 1.0..30.0.
    watch_duration: float = Field(
        default=3.0,
        ge=1.0,
        le=30.0,
        description="Watch duration per video (seconds)",
    )
 class SessionControlRequest(BaseModel):
    """Payload for pausing, resuming or stopping a session."""
    # Required action name; the control endpoint accepts pause, resume, stop.
    action: str = Field(
        default=...,
        description="Action: pause, resume, stop",
    )
 class SessionStatus(BaseModel):
    """Session status response.

    Returned by the status endpoint; every field except ``current_video`` is
    copied from the dict produced by ``agent.get_session_progress()``.
    """
    session_id: str
    platform: str
    # Number of videos requested for the session.
    target_count: int
    # Number of videos recorded so far.
    watched_count: int
    progress_percent: float
    is_active: bool
    is_paused: bool
    # Presumably seconds -- TODO confirm against VideoLearningAgent.
    total_duration: float
    # Summary of the most recent record (sequence_id, timestamp,
    # screenshot_path, description, likes, comments), or None when the
    # session has no records yet.
    current_video: Optional[Dict] = None
 class VideoInfo(BaseModel):
    """Information about a watched video.

    One entry per record in the agent's current session, as returned by the
    session videos endpoint.
    """
    # Identifier of the record within its session.
    sequence_id: int
    timestamp: str
    # Path of the captured screenshot, if one was saved.
    screenshot_path: Optional[str] = None
    watch_duration: float
    description: Optional[str] = None
    likes: Optional[int] = None
    comments: Optional[int] = None
    tags: List[str] = []
    category: Optional[str] = None
 # Global session storage (in production, use database).
 # Maps the session ID returned by VideoLearningAgent.start_session() to the
 # agent that owns the session. Entries are added by create_session and removed
 # on stop, delete, or when the agent reports session completion.
 _active_sessions: Dict[str, VideoLearningAgent] = {}
@router.post("/sessions", response_model=Dict[str, str])
 async def create_session(
    request: SessionCreateRequest,
    device_manager: DeviceManager = Depends(get_device_manager),
 ) -> Dict[str, str]:
    """Create a new video learning session."""
    # Check device availability
    device = await device_manager.get_device(request.device_id)
    if not device:
        raise HTTPException(status_code=404, detail="Device not found")
    if not device.is_connected:
        raise HTTPException(status_code=400, detail="Device not connected")
    if device.status == "busy":
        raise HTTPException(status_code=409, detail="Device is busy")
    # Create model config from environment
    model_config = ModelConfig(
        base_url=config.MODEL_BASE_URL,
        model_name=config.MODEL_NAME,
        api_key=config.MODEL_API_KEY,
        max_tokens=config.MAX_TOKENS,
        temperature=config.TEMPERATURE,
        top_p=config.TOP_P,
        frequency_penalty=config.FREQUENCY_PENALTY,
        lang="cn",
    )
    # Create video learning agent
    agent = VideoLearningAgent(
        model_config=model_config,
        platform=request.platform,
        output_dir=config.VIDEO_LEARNING_OUTPUT_DIR,
    )
    # Setup callbacks for real-time updates
    session_id = None
    def on_video_watched(record):
        """Callback when a video is watched."""
        # Broadcast via WebSocket
        if session_id:
            # This would be integrated with WebSocket manager
            pass
    def on_progress_update(current, total):
        """Callback for progress updates."""
        if session_id:
            # Broadcast progress
            pass
    def on_session_complete(session):
        """Callback when session completes."""
        if session_id and session_id in _active_sessions:
            del _active_sessions[session_id]
    agent.on_video_watched = on_video_watched
    agent.on_progress_update = on_progress_update
    agent.on_session_complete = on_session_complete
    # Start session
    session_id = agent.start_session(
        device_id=request.device_id,
        target_count=request.target_count,
        category=request.category,
        watch_duration=request.watch_duration,
        max_steps=500,
    )
    # Store session
    _active_sessions[session_id] = agent
    return {"session_id": session_id, "status": "created"}
@router.post("/sessions/{session_id}/start", response_model=Dict[str, str])
 async def start_session(session_id: str) -> Dict[str, str]:
    """Start executing a learning session."""
    if session_id not in _active_sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    agent = _active_sessions[session_id]
    # Build task based on session parameters
    session = agent.current_session
    if not session:
        raise HTTPException(status_code=400, detail="Session not initialized")
    category = session.target_category
    target_count = session.target_count
    watch_duration = agent._watch_duration
    platform = agent.platform
    # Platform-specific app name and package
    platform_info = {
        "douyin": {
            "name": "抖音",
            "package": "com.ss.android.ugc.aweme",
        },
        "kuaishou": {
            "name": "快手",
            "package": "com.smile.gifmaker",
        },
        "tiktok": {
            "name": "TikTok",
            "package": "com.zhiliaoapp.musically",
        },
    }
    info = platform_info.get(platform, platform_info["douyin"])
    app_name = info["name"]
    # Build clear task instructions
    if category:
        task = f"""你是一个视频学习助手。请严格按照以下步骤执行：
 步骤1：启动应用
 - 回到主屏幕
 - 打开{app_name}应用
 步骤2：搜索内容
 - 在{app_name}中搜索"{category}"
 - 点击第一个搜索结果或进入相关页面
 步骤3：观看视频
 - 观看视频，每个视频停留约{watch_duration}秒
 - 记录视频的描述、点赞数、评论数
 - 向上滑动切换到下一个视频
 - 重复观看和记录，直到完成{target_count}个视频
 步骤4：完成任务
 - 完成观看{target_count}个视频后，总结所有视频信息
 请现在开始执行。"""
    else:
        task = f"""你是一个视频学习助手。请严格按照以下步骤执行：
 步骤1：启动应用
 - 回到主屏幕
 - 打开{app_name}应用
 步骤2：观看推荐视频
 - 进入{app_name}的推荐页面
 - 观看推荐视频，每个视频停留约{watch_duration}秒
 - 记录视频的描述、点赞数、评论数
 - 向上滑动切换到下一个视频
 - 重复观看和记录，直到完成{target_count}个视频
 步骤3：完成任务
 - 完成观看{target_count}个视频后，总结所有视频信息
 请现在开始执行。"""
    # Run in background
    asyncio.create_task(asyncio.to_thread(agent.run_learning_task, task))
    return {"session_id": session_id, "status": "started"}
@router.post("/sessions/{session_id}/control", response_model=Dict[str, str])
 async def control_session(
    session_id: str, request: SessionControlRequest
 ) -> Dict[str, str]:
    """Control a learning session (pause/resume/stop)."""
    if session_id not in _active_sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    agent = _active_sessions[session_id]
    if request.action == "pause":
        agent.pause_session()
        return {"session_id": session_id, "status": "paused"}
    elif request.action == "resume":
        agent.resume_session()
        return {"session_id": session_id, "status": "resumed"}
    elif request.action == "stop":
        agent.stop_session()
        # Remove from active sessions
        del _active_sessions[session_id]
        return {"session_id": session_id, "status": "stopped"}
    else:
        raise HTTPException(status_code=400, detail=f"Invalid action: {request.action}")
@router.get("/sessions/{session_id}/status", response_model=SessionStatus)
 async def get_session_status(session_id: str) -> SessionStatus:
    """Get session status."""
    if session_id not in _active_sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    agent = _active_sessions[session_id]
    progress = agent.get_session_progress()
    # Get current video info if available
    current_video = None
    if agent.current_session and agent.current_session.records:
        latest = agent.current_session.records[-1]
        current_video = {
            "sequence_id": latest.sequence_id,
            "timestamp": latest.timestamp,
            "screenshot_path": latest.screenshot_path,
            "description": latest.description,
            "likes": latest.likes,
            "comments": latest.comments,
        }
    return SessionStatus(
        session_id=progress["session_id"],
        platform=progress["platform"],
        target_count=progress["target_count"],
        watched_count=progress["watched_count"],
        progress_percent=progress["progress_percent"],
        is_active=progress["is_active"],
        is_paused=progress["is_paused"],
        total_duration=progress["total_duration"],
        current_video=current_video,
    )
@router.get("/sessions/{session_id}/videos", response_model=List[VideoInfo])
 async def get_session_videos(session_id: str) -> List[VideoInfo]:
    """Get all videos from a session."""
    if session_id not in _active_sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    agent = _active_sessions[session_id]
    if not agent.current_session:
        return []
    return [
        VideoInfo(
            sequence_id=r.sequence_id,
            timestamp=r.timestamp,
            screenshot_path=r.screenshot_path,
            watch_duration=r.watch_duration,
            description=r.description,
            likes=r.likes,
            comments=r.comments,
            tags=r.tags,
            category=r.category,
        )
        for r in agent.current_session.records
    ]
@router.get("/sessions", response_model=List[str])
 async def list_sessions() -> List[str]:
    """List all active session IDs."""
    return list(_active_sessions.keys())
@router.delete("/sessions/{session_id}", response_model=Dict[str, str])
 async def delete_session(session_id: str) -> Dict[str, str]:
    """Delete a session."""
    if session_id not in _active_sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    del _active_sessions[session_id]
    return {"session_id": session_id, "status": "deleted"}
--- a/dashboard/config.py
+++ b/dashboard/config.py
@@ -39,6 +39,13 @@ class DashboardConfig:
    MODEL_BASE_URL: str = os.getenv("PHONE_AGENT_BASE_URL", "http://localhost:8000/v1")
    MODEL_NAME: str = os.getenv("PHONE_AGENT_MODEL", "autoglm-phone-9b")
    MODEL_API_KEY: str = os.getenv("PHONE_AGENT_API_KEY", "EMPTY")
    MAX_TOKENS: int = int(os.getenv("PHONE_AGENT_MAX_TOKENS", "3000"))
    TEMPERATURE: float = float(os.getenv("PHONE_AGENT_TEMPERATURE", "0.0"))
    TOP_P: float = float(os.getenv("PHONE_AGENT_TOP_P", "0.85"))
    FREQUENCY_PENALTY: float = float(os.getenv("PHONE_AGENT_FREQUENCY_PENALTY", "0.2"))
    # Video learning settings
    VIDEO_LEARNING_OUTPUT_DIR: str = os.getenv("VIDEO_LEARNING_OUTPUT_DIR", "./video_learning_data")
    # Task history
    MAX_TASK_HISTORY: int = int(os.getenv("MAX_TASK_HISTORY", "100"))
--- a/dashboard/main.py
+++ b/dashboard/main.py
@@ -16,7 +16,7 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import FileResponse, JSONResponse
 from fastapi.staticfiles import StaticFiles
-from dashboard.api import devices_router, tasks_router, websocket_router
+from dashboard.api import devices_router, tasks_router, websocket_router, video_learning_router
 from dashboard.config import config
 from dashboard.dependencies import (
    get_device_manager,
@@ -104,6 +104,7 @@ async def global_exception_handler(request: Request, exc: Exception):
 app.include_router(devices_router, prefix="/api")
 app.include_router(tasks_router, prefix="/api")
 app.include_router(websocket_router)
 app.include_router(video_learning_router)
 # Health check
@@ -163,6 +164,12 @@ if static_path.exists():
    app.mount("/static", StaticFiles(directory=str(static_path)), name="static")
 # Mount static files for video learning screenshots.
 # The directory is created up front: if it were missing at startup the mount
 # would be skipped and screenshots written later could not be served until
 # the dashboard was restarted.
 video_learning_data_path = Path(config.VIDEO_LEARNING_OUTPUT_DIR)
 video_learning_data_path.mkdir(parents=True, exist_ok=True)
 app.mount(
    "/video-learning-data",
    StaticFiles(directory=str(video_learning_data_path)),
    name="video-learning-data",
 )
 # Run script entry point
 if __name__ == "__main__":
    import uvicorn
--- a/dashboard/static/css/video-learning.css
+++ b/dashboard/static/css/video-learning.css
@@ -0,0 +1,283 @@
 /* Video Learning Module Styles */
 /* Colors are taken from CSS custom properties (--card-bg, --border-color,
    --text-primary, ...) that this stylesheet uses but does not define;
    presumably they come from dashboard.css -- confirm. */
 /* Header modifications */
 /* Lay out the heading's children in a row, vertically centered. */
 .header h1 {
    display: flex;
    align-items: center;
    gap: 0.75rem;
 }
 /* Configuration Section */
 /* Card capped at 800px wide and centered horizontally. */
 .config-section {
    background-color: var(--card-bg);
    border: 1px solid var(--border-color);
    border-radius: 12px;
    padding: 2rem;
    max-width: 800px;
    margin: 0 auto;
 }
 /* Form groups are stacked vertically. */
 .config-form {
    display: flex;
    flex-direction: column;
    gap: 1.5rem;
 }
 /* One group = label, control and optional hint, stacked. */
 .form-group {
    display: flex;
    flex-direction: column;
    gap: 0.5rem;
 }
 .form-group label {
    font-size: 0.875rem;
    font-weight: 500;
    color: var(--text-primary);
 }
 .form-group select,
 .form-group input {
    padding: 0.75rem 1rem;
    background-color: var(--bg-color);
    border: 1px solid var(--border-color);
    border-radius: 8px;
    color: var(--text-primary);
    font-size: 0.95rem;
 }
 /* Replace the default focus outline with a colored border. */
 .form-group select:focus,
 .form-group input:focus {
    outline: none;
    border-color: var(--primary-color);
 }
 .form-group select:disabled,
 .form-group input:disabled {
    opacity: 0.5;
    cursor: not-allowed;
 }
 /* Hint text under a control. */
 .form-group small {
    font-size: 0.75rem;
    color: var(--text-secondary);
 }
 /* Two equal columns; reduced to one column in the max-width: 768px media query. */
 .form-row {
    display: grid;
    grid-template-columns: 1fr 1fr;
    gap: 1rem;
 }
 /* Session Section */
 /* Card wrapping the session view. */
 .session-section {
    background-color: var(--card-bg);
    border: 1px solid var(--border-color);
    border-radius: 12px;
    padding: 2rem;
 }
 /* Title on the left, controls on the right. */
 .session-header {
    display: flex;
    justify-content: space-between;
    align-items: center;
    margin-bottom: 1.5rem;
 }
 .session-header h2 {
    font-size: 1.25rem;
    font-weight: 600;
    color: var(--text-primary);
 }
 .session-controls {
    display: flex;
    gap: 0.5rem;
 }
 /* Progress Section */
 .progress-section {
    background-color: var(--bg-color);
    border-radius: 8px;
    padding: 1.5rem;
    margin-bottom: 1.5rem;
 }
 .progress-info {
    display: flex;
    justify-content: space-between;
    margin-bottom: 0.5rem;
    font-size: 0.875rem;
    color: var(--text-secondary);
 }
 /* Track of the progress bar; overflow is hidden so the fill is clipped to
    the rounded corners. */
 .progress-bar-large {
    height: 8px;
    background-color: rgba(99, 102, 241, 0.2);
    border-radius: 4px;
    overflow: hidden;
 }
 /* Filled part of the bar. No width is set here, so it is presumably set
    from markup/script; width changes animate over 0.3s. */
 .progress-fill {
    height: 100%;
    background-color: var(--primary-color);
    transition: width 0.3s ease;
 }
 .progress-stats {
    margin-top: 0.5rem;
    font-size: 0.8rem;
    color: var(--text-secondary);
 }
 /* Current Video */
 .current-video {
    margin-bottom: 2rem;
 }
 .current-video h3 {
    font-size: 1rem;
    font-weight: 600;
    color: var(--text-primary);
    margin-bottom: 1rem;
 }
 /* Video Cards */
 /* Card for one video; the border takes the primary color on hover. */
 .video-card {
    background-color: var(--bg-color);
    border: 1px solid var(--border-color);
    border-radius: 8px;
    overflow: hidden;
    transition: border-color 0.2s;
 }
 .video-card:hover {
    border-color: var(--primary-color);
 }
 /* 9:16 (portrait) frame with a black background. */
 .video-screenshot {
    width: 100%;
    aspect-ratio: 9/16;
    background-color: #000;
    overflow: hidden;
 }
 /* object-fit: contain scales the image to fit without cropping. */
 .video-screenshot img {
    width: 100%;
    height: 100%;
    object-fit: contain;
 }
 /* Same 9:16 frame with centered content, for use when no image is shown. */
 .video-placeholder {
    width: 100%;
    aspect-ratio: 9/16;
    background-color: var(--bg-color);
    display: flex;
    align-items: center;
    justify-content: center;
    color: var(--text-secondary);
 }
 .video-info {
    padding: 1rem;
 }
 .video-id {
    font-size: 0.75rem;
    font-weight: 600;
    color: var(--primary-color);
    margin-bottom: 0.5rem;
 }
 .video-description {
    font-size: 0.875rem;
    color: var(--text-primary);
    margin-bottom: 0.5rem;
    line-height: 1.4;
 }
 /* Row of statistics below the description. */
 .video-stats {
    display: flex;
    gap: 1rem;
    font-size: 0.75rem;
    color: var(--text-secondary);
 }
 .video-stats span {
    display: flex;
    align-items: center;
    gap: 0.25rem;
 }
 /* Keep icons at their own size when the row is tight. */
 .video-stats svg {
    flex-shrink: 0;
 }
 /* Session Complete */
 /* Centered panel. */
 .session-complete {
    text-align: center;
    padding: 3rem 2rem;
 }
 .complete-icon {
    display: flex;
    justify-content: center;
    margin-bottom: 1rem;
    color: var(--success-color);
 }
 .session-complete h3 {
    font-size: 1.5rem;
    font-weight: 600;
    color: var(--text-primary);
    margin-bottom: 0.5rem;
 }
 .session-complete p {
    color: var(--text-secondary);
    margin-bottom: 1.5rem;
 }
 /* Video Grid */
 /* As many columns as fit, each at least 200px wide. */
 .video-grid {
    display: grid;
    grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
    gap: 1rem;
 }
 .video-grid .video-card {
    font-size: 0.875rem;
 }
 /* Restates the 9/16 ratio already declared on .video-screenshot and
    .video-placeholder. */
 .video-grid .video-screenshot,
 .video-grid .video-placeholder {
    aspect-ratio: 9/16;
 }
 /* History Section */
 .history-section {
    margin-top: 2rem;
 }
 .history-section h2 {
    font-size: 1.25rem;
    font-weight: 600;
    color: var(--text-primary);
    margin-bottom: 1rem;
 }
 /* Responsive */
 /* Viewports up to 768px wide: single-column form rows, stacked session
    header, and narrower (150px minimum) grid columns. */
@media (max-width: 768px) {
    .form-row {
        grid-template-columns: 1fr;
    }
    .session-header {
        flex-direction: column;
        gap: 1rem;
        align-items: flex-start;
    }
    .video-grid {
        grid-template-columns: repeat(auto-fill, minmax(150px, 1fr));
    }
 }
--- a/dashboard/static/index.html
+++ b/dashboard/static/index.html
@@ -41,6 +41,13 @@
                </div>
            </div>
            <div class="header-actions">
                <a href="/static/video-learning.html" class="btn btn-primary">
                    <svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                        <polygon points="23 7 16 12 23 17 23 7"></polygon>
                        <rect x="1" y="5" width="15" height="14" rx="2" ry="2"></rect>
                    </svg>
                    Video Learning
                </a>
                <button @click="refreshDevices" class="btn btn-secondary" :disabled="refreshing">
                    <svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" :class="{ spinning: refreshing }">
                        <polyline points="23 4 23 10 17 10"></polyline>
--- a/dashboard/static/js/video-learning.js
+++ b/dashboard/static/js/video-learning.js
@@ -0,0 +1,200 @@
 /**
 * Video Learning Module for AutoGLM Dashboard
 *
 * This module provides UI and functionality for the Video Learning Agent,
 * allowing users to watch and learn from short video platforms.
 */
 const VideoLearningModule = {
    // Current session state
    currentSessionId: null,
    currentSessionStatus: null,
    videos: [],
    isPolling: false,
    // Handle returned by setInterval while polling is active, otherwise null
    pollInterval: null,
    // True while a poll tick is still awaiting the API, so ticks never overlap
    pollInFlight: false,
    // Create a new learning session.
    // On success remembers the new session ID and starts polling for updates.
    // Returns the response body; rethrows request errors after logging them.
    async createSession(deviceId, options = {}) {
        const {
            platform = 'douyin',
            targetCount = 10,
            category = null,
            watchDuration = 3.0,
        } = options;
        try {
            const response = await axios.post('/api/video-learning/sessions', {
                device_id: deviceId,
                platform: platform,
                target_count: targetCount,
                category: category,
                watch_duration: watchDuration,
            });
            this.currentSessionId = response.data.session_id;
            this.startPolling();
            return response.data;
        } catch (error) {
            console.error('Error creating session:', error);
            throw error;
        }
    },
    // Start a session (asks the server to begin executing it)
    async startSession(sessionId) {
        try {
            const response = await axios.post(`/api/video-learning/sessions/${sessionId}/start`);
            return response.data;
        } catch (error) {
            console.error('Error starting session:', error);
            throw error;
        }
    },
    // Control a session (pause/resume/stop)
    async controlSession(sessionId, action) {
        try {
            const response = await axios.post(`/api/video-learning/sessions/${sessionId}/control`, {
                action: action,
            });
            return response.data;
        } catch (error) {
            console.error('Error controlling session:', error);
            throw error;
        }
    },
    // Get session status and cache it in currentSessionStatus
    async getSessionStatus(sessionId) {
        try {
            const response = await axios.get(`/api/video-learning/sessions/${sessionId}/status`);
            this.currentSessionStatus = response.data;
            return response.data;
        } catch (error) {
            console.error('Error getting session status:', error);
            throw error;
        }
    },
    // Get session videos and cache them in videos
    async getSessionVideos(sessionId) {
        try {
            const response = await axios.get(`/api/video-learning/sessions/${sessionId}/videos`);
            this.videos = response.data;
            return response.data;
        } catch (error) {
            console.error('Error getting session videos:', error);
            throw error;
        }
    },
    // List all active sessions
    async listSessions() {
        try {
            const response = await axios.get('/api/video-learning/sessions');
            return response.data;
        } catch (error) {
            console.error('Error listing sessions:', error);
            throw error;
        }
    },
    // Delete a session; if it is the current one, clear local state and stop polling
    async deleteSession(sessionId) {
        try {
            const response = await axios.delete(`/api/video-learning/sessions/${sessionId}`);
            if (this.currentSessionId === sessionId) {
                this.currentSessionId = null;
                this.currentSessionStatus = null;
                this.stopPolling();
            }
            return response.data;
        } catch (error) {
            console.error('Error deleting session:', error);
            throw error;
        }
    },
    // Fetch status and videos for a session, then notify listeners through a
    // 'videoLearningUpdate' event on window.
    async refreshAndNotify(sessionId) {
        await this.getSessionStatus(sessionId);
        await this.getSessionVideos(sessionId);
        // Trigger custom event for UI updates
        window.dispatchEvent(new CustomEvent('videoLearningUpdate', {
            detail: {
                sessionId: sessionId,
                status: this.currentSessionStatus,
                videos: this.videos,
            }
        }));
    },
    // Start polling for session updates.
    // Polling stops by itself when the session reports is_active === false or
    // when the server answers 404 (the session was removed server-side).
    startPolling(intervalMs = 1000) {
        if (this.isPolling) return;
        this.isPolling = true;
        this.pollInterval = setInterval(async () => {
            // setInterval does not wait for async callbacks; skip this tick if
            // the previous one is still awaiting the API.
            if (!this.currentSessionId || this.pollInFlight) return;
            // Capture the ID so a concurrent deleteSession() cannot turn the
            // second request of this tick into a call with null.
            const sessionId = this.currentSessionId;
            this.pollInFlight = true;
            try {
                await this.refreshAndNotify(sessionId);
                // Stop polling if session is complete, but do one final update
                if (this.currentSessionStatus && !this.currentSessionStatus.is_active) {
                    console.log('[VideoLearning] Session completed, doing final update...');
                    // Do one final update to ensure we have the latest data
                    await this.refreshAndNotify(sessionId);
                    console.log('[VideoLearning] Final update complete, stopping poll');
                    this.stopPolling();
                }
            } catch (error) {
                if (error && error.response && error.response.status === 404) {
                    // The session no longer exists on the server (stopped,
                    // deleted or completed); retrying would 404 forever.
                    console.log('[VideoLearning] Session no longer exists, stopping poll');
                    this.stopPolling();
                } else {
                    console.error('Error polling session status:', error);
                    // Don't stop polling on other errors, just log them
                }
            } finally {
                this.pollInFlight = false;
            }
        }, intervalMs);
        console.log(`[VideoLearning] Started polling with ${intervalMs}ms interval`);
    },
    // Stop polling
    stopPolling() {
        if (this.pollInterval) {
            clearInterval(this.pollInterval);
            this.pollInterval = null;
            console.log('[VideoLearning] Stopped polling');
        }
        this.isPolling = false;
    },
    // Format a duration in seconds as "12.3s" or "2m 5.0s".
    // Returns 'N/A' for null, undefined or NaN.
    formatDuration(seconds) {
        if (seconds === null || seconds === undefined || Number.isNaN(seconds)) return 'N/A';
        // Round to tenths before splitting into minutes and seconds so the
        // seconds part can never be displayed as "60.0".
        const tenths = Math.round(seconds * 10);
        const rounded = tenths / 10;
        if (rounded < 60) {
            return `${rounded.toFixed(1)}s`;
        }
        const minutes = Math.floor(rounded / 60);
        const remainingSeconds = rounded - minutes * 60;
        return `${minutes}m ${remainingSeconds.toFixed(1)}s`;
    },
    // Format number with K/M suffix; returns 'N/A' for null or undefined
    formatNumber(num) {
        if (num === null || num === undefined) return 'N/A';
        // 999950 and above would round to "1000.0K", so they switch to M.
        if (num >= 999950) {
            return `${(num / 1000000).toFixed(1)}M`;
        } else if (num >= 1000) {
            return `${(num / 1000).toFixed(1)}K`;
        }
        return num.toString();
    },
 };
 // Export for use in other modules
 if (typeof module !== 'undefined' && module.exports) {
    module.exports = VideoLearningModule;
 }
--- a/dashboard/static/video-learning.html
+++ b/dashboard/static/video-learning.html
@@ -0,0 +1,412 @@
 <!DOCTYPE html>
 <html lang="zh-CN">
 <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Video Learning - AutoGLM Dashboard</title>
    <!-- Vue.js 3 -->
    <script src="https://unpkg.com/vue@3/dist/vue.global.js"></script>
    <!-- Axios for API requests -->
    <script src="https://unpkg.com/axios/dist/axios.min.js"></script>
    <!-- CSS -->
    <link rel="stylesheet" href="/static/css/dashboard.css">
    <link rel="stylesheet" href="/static/css/video-learning.css">
 </head>
 <body>
    <div id="app">
        <!-- Header -->
        <header class="header">
            <div class="header-content">
                <h1>
                    <svg width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                        <polygon points="23 7 16 12 23 17 23 7"></polygon>
                        <rect x="1" y="5" width="15" height="14" rx="2" ry="2"></rect>
                    </svg>
                    Video Learning Agent
                </h1>
                <div class="stats">
                    <span class="stat" title="Session Status">
                        <svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                            <circle cx="12" cy="12" r="10"></circle>
                            <polyline points="12 6 12 12 16 14"></polyline>
                        </svg>
                        {{ sessionStatus ? sessionStatus.status : 'No Session' }}
                    </span>
                    <span class="stat" v-if="sessionStatus" title="Progress">
                        <svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                            <path d="M22 11.08V12a10 10 0 1 1-5.93-9.14"></path>
                            <polyline points="22 4 12 14.01 9 11.01"></polyline>
                        </svg>
                        {{ sessionStatus.watched_count }} / {{ sessionStatus.target_count }}
                    </span>
                </div>
            </div>
            <div class="header-actions">
                <button @click="goBack" class="btn btn-secondary">
                    <svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                        <line x1="19" y1="12" x2="5" y2="12"></line>
                        <polyline points="12 19 5 12 12 5"></polyline>
                    </svg>
                    Back
                </button>
            </div>
        </header>
        <!-- Main Content -->
        <main class="main-content">
            <!-- Configuration Section -->
            <section class="config-section" v-if="!currentSessionId">
                <h2>Create Learning Session</h2>
                <div class="config-form">
                    <div class="form-group">
                        <label>Device</label>
                        <select v-model="config.deviceId" :disabled="loading">
                            <option value="">Select a device...</option>
                            <option v-for="device in devices" :key="device.device_id" :value="device.device_id"
                                    :disabled="!device.is_connected || device.status === 'busy'">
                                {{ device.device_id }}
                                {{ !device.is_connected ? '(Disconnected)' : '' }}
                                {{ device.status === 'busy' ? '(Busy)' : '' }}
                            </option>
                        </select>
                    </div>
                    <div class="form-group">
                        <label>Platform</label>
                        <select v-model="config.platform" :disabled="loading">
                            <option value="douyin">Douyin (抖音)</option>
                            <option value="kuaishou">Kuaishou (快手)</option>
                            <option value="tiktok">TikTok</option>
                        </select>
                    </div>
                    <div class="form-row">
                        <div class="form-group">
                            <label>Target Videos</label>
                            <input type="number" v-model.number="config.targetCount" min="1" max="100" :disabled="loading">
                        </div>
                        <div class="form-group">
                            <label>Watch Duration (s)</label>
                            <input type="number" v-model.number="config.watchDuration" min="1" max="30" step="0.5" :disabled="loading">
                        </div>
                    </div>
                    <div class="form-group">
                        <label>Category (Optional)</label>
                        <input type="text" v-model="config.category" placeholder="e.g., 美食, 旅行, 搞笑" :disabled="loading">
                        <small>Leave empty to watch recommended videos</small>
                    </div>
                    <button @click="createAndStartSession" class="btn btn-primary" :disabled="loading || !config.deviceId">
                        <svg v-if="loading" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" class="spinning">
                            <path d="M21 12a9 9 0 1 1-6.219-8.56"></path>
                        </svg>
                        {{ loading ? 'Creating...' : 'Start Learning' }}
                    </button>
                </div>
            </section>
            <!-- Session Control Section -->
            <section class="session-section" v-if="currentSessionId && sessionStatus">
                <div class="session-header">
                    <h2>Session: {{ currentSessionId }}</h2>
                    <div class="session-controls">
                        <button v-if="sessionStatus.is_paused" @click="resumeSession" class="btn btn-primary btn-sm">
                            <svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                                <polygon points="5 3 19 12 5 21 5 3"></polygon>
                            </svg>
                            Resume
                        </button>
                        <button v-else-if="sessionStatus.is_active" @click="pauseSession" class="btn btn-secondary btn-sm">
                            <svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                                <rect x="6" y="4" width="4" height="16"></rect>
                                <rect x="14" y="4" width="4" height="16"></rect>
                            </svg>
                            Pause
                        </button>
                        <button v-if="sessionStatus.is_active" @click="stopSession" class="btn btn-danger btn-sm">
                            <svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                                <rect x="6" y="6" width="12" height="12"></rect>
                            </svg>
                            Stop
                        </button>
                    </div>
                </div>
                <!-- Progress Bar -->
                <div class="progress-section" v-if="sessionStatus.is_active || sessionStatus.is_paused">
                    <div class="progress-info">
                        <span>Progress: {{ sessionStatus.watched_count }} / {{ sessionStatus.target_count }}</span>
                        <span>{{ Math.round(sessionStatus.progress_percent) }}%</span>
                    </div>
                    <div class="progress-bar-large">
                        <div class="progress-fill" :style="{ width: sessionStatus.progress_percent + '%' }"></div>
                    </div>
                    <div class="progress-stats">
                        <span>Total Duration: {{ formatDuration(sessionStatus.total_duration) }}</span>
                    </div>
                </div>
                <!-- Current Video -->
                <div class="current-video" v-if="sessionStatus.current_video">
                    <h3>Current Video</h3>
                    <div class="video-card">
                        <div class="video-screenshot" v-if="sessionStatus.current_video.screenshot_path">
                            <img :src="sessionStatus.current_video.screenshot_path" alt="Current video">
                        </div>
                        <div class="video-info">
                            <div class="video-id">#{{ sessionStatus.current_video.sequence_id }}</div>
                            <div class="video-description" v-if="sessionStatus.current_video.description">
                                {{ sessionStatus.current_video.description }}
                            </div>
                            <div class="video-stats">
                                <span v-if="sessionStatus.current_video.likes">
                                    <svg width="14" height="14" viewBox="0 0 24 24" fill="currentColor" stroke="none">
                                        <path d="M20.84 4.61a5.5 5.5 0 0 0-7.78 0L12 5.67l-1.06-1.06a5.5 5.5 0 0 0-7.78 7.78l1.06 1.06L12 21.23l7.78-7.78 1.06-1.06a5.5 5.5 0 0 0 0-7.78z"></path>
                                    </svg>
                                    {{ formatNumber(sessionStatus.current_video.likes) }}
                                </span>
                                <span v-if="sessionStatus.current_video.comments">
                                    <svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                                        <path d="M21 11.5a8.38 8.38 0 0 1-.9 3.8 8.5 8.5 0 0 1-7.6 4.7 8.38 8.38 0 0 1-3.8-.9L3 21l1.9-5.7a8.38 8.38 0 0 1-.9-3.8 8.5 8.5 0 0 1 4.7-7.6 8.38 8.38 0 0 1 3.8-.9h.5a8.48 8.48 0 0 1 8 8v.5z"></path>
                                    </svg>
                                    {{ formatNumber(sessionStatus.current_video.comments) }}
                                </span>
                            </div>
                        </div>
                    </div>
                </div>
                <!-- Session Complete -->
                <div class="session-complete" v-if="!sessionStatus.is_active && currentSessionId">
                    <div class="complete-icon">
                        <svg width="64" height="64" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                            <path d="M22 11.08V12a10 10 0 1 1-5.93-9.14"></path>
                            <polyline points="22 4 12 14.01 9 11.01"></polyline>
                        </svg>
                    </div>
                    <h3>Session Complete!</h3>
                    <p>Watched {{ sessionStatus.watched_count }} videos in {{ formatDuration(sessionStatus.total_duration) }}</p>
                    <button @click="resetSession" class="btn btn-primary">Start New Session</button>
                </div>
            </section>
            <!-- Video History -->
            <section class="history-section" v-if="videos.length > 0">
                <h2>Watched Videos</h2>
                <div class="video-grid">
                    <div v-for="video in videos" :key="video.sequence_id" class="video-card">
                        <div class="video-screenshot" v-if="video.screenshot_path">
                            <img :src="video.screenshot_path" :alt="'Video ' + video.sequence_id">
                        </div>
                        <div class="video-placeholder" v-else>
                            <svg width="32" height="32" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                                <rect x="2" y="2" width="20" height="20" rx="2.18" ry="2.18"></rect>
                                <line x1="7" y1="2" x2="7" y2="22"></line>
                                <line x1="17" y1="2" x2="17" y2="22"></line>
                                <line x1="2" y1="12" x2="22" y2="12"></line>
                                <line x1="2" y1="7" x2="7" y2="7"></line>
                                <line x1="2" y1="17" x2="7" y2="17"></line>
                                <line x1="17" y1="17" x2="22" y2="17"></line>
                                <line x1="17" y1="7" x2="22" y2="7"></line>
                            </svg>
                        </div>
                        <div class="video-info">
                            <div class="video-id">#{{ video.sequence_id }}</div>
                            <div class="video-description" v-if="video.description">{{ video.description }}</div>
                            <div class="video-stats">
                                <span v-if="video.likes">
                                    <svg width="12" height="12" viewBox="0 0 24 24" fill="currentColor" stroke="none">
                                        <path d="M20.84 4.61a5.5 5.5 0 0 0-7.78 0L12 5.67l-1.06-1.06a5.5 5.5 0 0 0-7.78 7.78l1.06 1.06L12 21.23l7.78-7.78 1.06-1.06a5.5 5.5 0 0 0 0-7.78z"></path>
                                    </svg>
                                    {{ formatNumber(video.likes) }}
                                </span>
                                <span v-if="video.comments">
                                    <svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                                        <path d="M21 11.5a8.38 8.38 0 0 1-.9 3.8 8.5 8.5 0 0 1-7.6 4.7 8.38 8.38 0 0 1-3.8-.9L3 21l1.9-5.7a8.38 8.38 0 0 1-.9-3.8 8.5 8.5 0 0 1 4.7-7.6 8.38 8.38 0 0 1 3.8-.9h.5a8.48 8.48 0 0 1 8 8v.5z"></path>
                                    </svg>
                                    {{ formatNumber(video.comments) }}
                                </span>
                            </div>
                        </div>
                    </div>
                </div>
            </section>
        </main>
        <!-- Toast notifications -->
        <div class="toast-container">
            <div v-for="toast in toasts" :key="toast.id" class="toast" :class="toast.type">
                {{ toast.message }}
            </div>
        </div>
    </div>
    <script src="/static/js/video-learning.js"></script>
    <script>
        const { createApp } = Vue;
        // Root component of the Video Learning page. It owns the session
        // form, the session controls and the watched-video list, and
        // delegates all API traffic to VideoLearningModule.
        createApp({
            data() {
                return {
                    devices: [],
                    currentSessionId: null,
                    sessionStatus: null,
                    videos: [],
                    loading: false,
                    toasts: [],
                    toastIdCounter: 0,
                    config: {
                        deviceId: '',
                        platform: 'douyin',
                        targetCount: 10,
                        category: '',
                        watchDuration: 3.0,
                    },
                };
            },
            mounted() {
                this.loadDevices();
                this.setupVideoLearningEvents();
            },
            beforeUnmount() {
                // Remove the window listener registered in
                // setupVideoLearningEvents so it cannot outlive the component.
                window.removeEventListener('videoLearningUpdate', this.handleVideoLearningUpdate);
                VideoLearningModule.stopPolling();
            },
            methods: {
                // Fetch the device list shown in the "Device" dropdown.
                async loadDevices() {
                    try {
                        const response = await axios.get('/api/devices');
                        this.devices = response.data;
                    } catch (error) {
                        this.showToast('Failed to load devices', 'error');
                    }
                },
                // Create a session from the form values and start it right away.
                async createAndStartSession() {
                    if (!this.config.deviceId) {
                        this.showToast('Please select a device', 'error');
                        return;
                    }
                    this.loading = true;
                    // Tracks which step failed so the error toast is accurate.
                    let sessionCreated = false;
                    try {
                        // Create session
                        const createResult = await VideoLearningModule.createSession(
                            this.config.deviceId,
                            {
                                platform: this.config.platform,
                                targetCount: this.config.targetCount,
                                category: this.config.category || null,
                                watchDuration: this.config.watchDuration,
                            }
                        );
                        this.currentSessionId = createResult.session_id;
                        sessionCreated = true;
                        this.showToast('Session created! Starting...', 'success');
                        // Start session
                        await VideoLearningModule.startSession(this.currentSessionId);
                        this.showToast('Learning session started!', 'success');
                        // Initial status update
                        await this.updateSessionStatus();
                    } catch (error) {
                        const failedStep = sessionCreated ? 'start' : 'create';
                        this.showToast(`Failed to ${failedStep} session: ` + error.message, 'error');
                        // Without this reset a failed start leaves currentSessionId
                        // set and sessionStatus null: the form and the session
                        // panel are both hidden and the user cannot retry.
                        this.resetSession();
                    } finally {
                        this.loading = false;
                    }
                },
                async pauseSession() {
                    if (!this.currentSessionId) return;
                    try {
                        await VideoLearningModule.controlSession(this.currentSessionId, 'pause');
                        await this.updateSessionStatus();
                        this.showToast('Session paused', 'info');
                    } catch (error) {
                        this.showToast('Failed to pause session', 'error');
                    }
                },
                async resumeSession() {
                    if (!this.currentSessionId) return;
                    try {
                        await VideoLearningModule.controlSession(this.currentSessionId, 'resume');
                        await this.updateSessionStatus();
                        this.showToast('Session resumed', 'info');
                    } catch (error) {
                        this.showToast('Failed to resume session', 'error');
                    }
                },
                async stopSession() {
                    if (!this.currentSessionId) return;
                    if (!confirm('Are you sure you want to stop this session?')) return;
                    try {
                        await VideoLearningModule.controlSession(this.currentSessionId, 'stop');
                        await this.updateSessionStatus();
                        this.showToast('Session stopped', 'info');
                    } catch (error) {
                        this.showToast('Failed to stop session', 'error');
                    }
                },
                // Pull the latest status and video list for the current session.
                // Errors are logged, not surfaced, so the last known state stays visible.
                async updateSessionStatus() {
                    if (!this.currentSessionId) return;
                    try {
                        this.sessionStatus = await VideoLearningModule.getSessionStatus(this.currentSessionId);
                        this.videos = await VideoLearningModule.getSessionVideos(this.currentSessionId);
                    } catch (error) {
                        console.error('Error updating session status:', error);
                    }
                },
                // Apply a 'videoLearningUpdate' event dispatched on window.
                handleVideoLearningUpdate(event) {
                    const { sessionId, status, videos } = event.detail;
                    // Ignore updates that arrive after resetSession() or that
                    // belong to another session; otherwise a late poll result
                    // would repopulate state that was just cleared.
                    if (!this.currentSessionId) return;
                    if (sessionId && sessionId !== this.currentSessionId) return;
                    this.sessionStatus = status;
                    this.videos = videos;
                },
                setupVideoLearningEvents() {
                    // Vue binds methods once per instance, so the same function
                    // reference can be handed to removeEventListener later.
                    window.addEventListener('videoLearningUpdate', this.handleVideoLearningUpdate);
                },
                // Drop all session state and return to the configuration form.
                resetSession() {
                    this.currentSessionId = null;
                    this.sessionStatus = null;
                    this.videos = [];
                    VideoLearningModule.stopPolling();
                },
                goBack() {
                    window.location.href = '/';
                },
                formatDuration(seconds) {
                    return VideoLearningModule.formatDuration(seconds);
                },
                formatNumber(num) {
                    return VideoLearningModule.formatNumber(num);
                },
                // Show a transient notification that removes itself after 3 seconds.
                showToast(message, type = 'info') {
                    const id = this.toastIdCounter++;
                    this.toasts.push({ id, message, type });
                    setTimeout(() => {
                        this.toasts = this.toasts.filter(t => t.id !== id);
                    }, 3000);
                },
            },
        }).mount('#app');
    </script>
 </body>
 </html>
--- a/docs/VIDEO_LEARNING.md
+++ b/docs/VIDEO_LEARNING.md
@@ -0,0 +1,253 @@
 # Video Learning Agent
 AI-powered agent for learning from short video platforms like Douyin (抖音), Kuaishou (快手), and TikTok.
 ## 功能特性
 ### MVP 功能
 - **自动滑动**: 自动在视频之间滑动切换
 - **播放控制**: 播放/暂停控制
 - **截图记录**: 为每个视频截图保存
 - **数据采集**: 采集视频描述、点赞数、评论数
 - **可视化管理**: 通过 Web Dashboard 可视化控制
 - **会话管理**: 创建、暂停、恢复、停止学习会话
 - **数据导出**: 导出学习数据（JSON/CSV）
 ## 快速开始
 ### 1. 启动 Dashboard
 ```bash
 # 使用脚本启动（推荐）
 scripts\run_video_learning_demo.bat          # Windows
 bash scripts/run_video_learning_demo.sh      # Linux/Mac
 # 或手动启动
 python -m uvicorn dashboard.main:app --host 0.0.0.0 --port 8080 --reload
 ```
 ### 2. 访问 Video Learning 页面
 打开浏览器访问: `http://localhost:8080/static/video-learning.html`
 或从主 Dashboard 页面点击 "Video Learning" 按钮。
 ### 3. 创建学习会话
 1. 选择设备
 2. 选择平台（抖音/快手/TikTok）
 3. 设置目标视频数量
 4. （可选）设置类别筛选
 5. 设置观看时长
 6. 点击 "Start Learning"
 ## 使用示例
 ### 独立运行
 ```bash
 # 演示脚本通过环境变量读取配置
 DEVICE_ID=emulator-5554 \
 TARGET_COUNT=10 \
 CATEGORY="美食" \
 WATCH_DURATION=3.0 \
 python examples/video_learning_demo.py
 ```
 ### 通过 Dashboard
 1. 打开 Video Learning 页面
 2. 配置学习参数
 3. 点击启动
 4. 实时查看进度
 ### API 调用
 ```python
 from phone_agent import VideoLearningAgent
 from phone_agent.model.client import ModelConfig
 # 创建模型配置
 model_config = ModelConfig(
    base_url="https://open.bigmodel.cn/api/paas/v4",
    model_name="autoglm-phone-9b",
    api_key="your-api-key",
 )
 # 创建 Video Learning Agent
 agent = VideoLearningAgent(
    model_config=model_config,
    platform="douyin",
    output_dir="./video_learning_data",
 )
 # 启动会话
 session_id = agent.start_session(
    device_id="emulator-5554",
    target_count=10,
    category="美食",
    watch_duration=3.0,
 )
 # 运行任务
 task = """
 在抖音上学习"美食"类视频：
 1. 打开抖音并搜索"美食"
 2. 观看视频，每个视频约3秒
 3. 记录描述、点赞数、评论数
 4. 滑动到下一个视频
 5. 重复直到观看完10个视频
 """
 success = agent.run_learning_task(task)
 # 导出数据
 agent.export_data("json")
 agent.export_data("csv")
 ```
 ## API 端点
 ### 创建会话
 ```http
 POST /api/video-learning/sessions
 Content-Type: application/json
 {
    "device_id": "emulator-5554",
    "platform": "douyin",
    "target_count": 10,
    "category": "美食",
    "watch_duration": 3.0
 }
 ```
 ### 启动会话
 ```http
 POST /api/video-learning/sessions/{session_id}/start
 ```
 ### 控制会话
 ```http
 POST /api/video-learning/sessions/{session_id}/control
 Content-Type: application/json
 {
    "action": "pause"  // pause, resume, stop
 }
 ```
 ### 获取会话状态
 ```http
 GET /api/video-learning/sessions/{session_id}/status
 ```
 ### 获取会话视频列表
 ```http
 GET /api/video-learning/sessions/{session_id}/videos
 ```
 ## 数据结构
 ### VideoRecord
 ```python
 {
    "sequence_id": 1,
    "timestamp": "2024-01-09T10:00:00",
    "screenshot_path": "./video_learning_data/screenshots/...",
    "watch_duration": 3.0,
    "description": "视频描述文案",
    "likes": 1000,
    "comments": 50,
    "shares": 20,
    "tags": [],
    "category": "美食",
    "elements": [],
    "position_in_session": 1
 }
 ```
 ### LearningSession
 ```python
 {
    "session_id": "session_20240109_100000",
    "start_time": "2024-01-09T10:00:00",
    "platform": "douyin",
    "target_category": "美食",
    "target_count": 10,
    "is_active": True,
    "is_paused": False,
    "total_videos": 10,
    "total_duration": 30.0,
    "records": [...]
 }
 ```
 ## 配置选项
 在 `.env` 文件中配置：
 ```bash
 # 视频学习数据输出目录
 VIDEO_LEARNING_OUTPUT_DIR=./video_learning_data
 # 模型参数
 PHONE_AGENT_MAX_TOKENS=3000
 PHONE_AGENT_TEMPERATURE=0.0
 PHONE_AGENT_TOP_P=0.85
 PHONE_AGENT_FREQUENCY_PENALTY=0.2
 ```
 ## 后续扩展计划
 ### 阶段 2: 高级分析
 - [ ] 视频内容特征提取
 - [ ] 常见元素识别
 - [ ] 视频风格分析
 - [ ] BGM 识别
 ### 阶段 3: 模式学习
 - [ ] 同类视频模式归纳
 - [ ] 创作趋势分析
 - [ ] 热门元素统计
 - [ ] 最佳实践总结
 ### 阶段 4: 创作辅助
 - [ ] 脚本生成
 - [ ] 分镜头建议
 - [ ] 拍摄指导
 - [ ] 剪辑建议
 ## 技术架构
 ```
 VideoLearningAgent
 ├── ModelConfig (VLM 配置)
 ├── LearningSession (会话管理)
 │   └── VideoRecord[] (视频记录)
 ├── Callbacks (回调函数)
 │   ├── on_video_watched
 │   ├── on_progress_update
 │   └── on_session_complete
 └── PhoneAgent (底层操作)
    ├── 视觉理解 (VLM)
    ├── 设备控制 (ADB/HDC/iOS)
    └── 任务执行
 ```
 ## 故障排除
 ### 问题: 设备未连接
 - 确保 ADB/HDC 服务正在运行
 - 检查设备是否通过 USB 连接
 - 尝试点击 "Refresh" 按钮
 ### 问题: 任务无法启动
 - 检查模型 API 配置
 - 确保 `.env` 文件正确配置
 - 查看 Dashboard 控制台日志
 ### 问题: 视频信息未采集
 - 确保 VLM 模型正常工作
 - 检查网络连接
 - 增加观看时长
 ## 许可证
 MIT License
--- a/examples/video_learning_demo.py
+++ b/examples/video_learning_demo.py
@@ -0,0 +1,161 @@
 """
 Video Learning Agent Demo
 This script demonstrates how to use the VideoLearningAgent to watch
 and learn from short video platforms like Douyin.
 Usage:
    python examples/video_learning_demo.py --device-id <device_id> --count 10
 """
 import os
 import sys
 from pathlib import Path
 # Add parent directory to path
 sys.path.insert(0, str(Path(__file__).parent.parent))
 from phone_agent.model.client import ModelConfig
 from phone_agent.video_learning import VideoLearningAgent
 def main(argv=None):
    """Run the Douyin video-learning demo end to end.

    Settings come from command-line flags; each flag falls back to the
    environment variable the script has always read (DEVICE_ID, TARGET_COUNT,
    WATCH_DURATION, CATEGORY) and then to a built-in default. Model access is
    configured through MODEL_BASE_URL, MODEL_API_KEY and MODEL_NAME.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    # Local import keeps this block self-contained.
    import argparse

    parser = argparse.ArgumentParser(description="Video Learning Agent Demo")
    # String defaults are run through ``type`` by argparse, so a malformed
    # environment value is reported as a normal usage error.
    parser.add_argument(
        "--device-id",
        default=os.getenv("DEVICE_ID", "emulator-5554"),
        help="Device to drive (env: DEVICE_ID)",
    )
    parser.add_argument(
        "--count",
        type=int,
        default=os.getenv("TARGET_COUNT", "10"),
        help="Number of videos to watch (env: TARGET_COUNT)",
    )
    parser.add_argument(
        "--watch-duration",
        type=float,
        default=os.getenv("WATCH_DURATION", "3.0"),
        help="Seconds to watch each video (env: WATCH_DURATION)",
    )
    parser.add_argument(
        "--category",
        default=os.getenv("CATEGORY"),
        help='Optional category to search for, e.g. "美食" (env: CATEGORY)',
    )
    args = parser.parse_args(argv)
    if args.count < 1:
        parser.error("--count must be at least 1")
    if args.watch_duration <= 0:
        parser.error("--watch-duration must be positive")

    # Load configuration from environment
    base_url = os.getenv("MODEL_BASE_URL", "http://localhost:8000/v1")
    api_key = os.getenv("MODEL_API_KEY", "your-api-key")
    model_name = os.getenv("MODEL_NAME", "autoglm-phone-9b")
    output_dir = os.getenv("VIDEO_LEARNING_OUTPUT_DIR", "./video_learning_data")
    # Configuration
    device_id = args.device_id
    target_count = args.count
    watch_duration = args.watch_duration
    category = args.category or None  # e.g., "美食", "旅行", "搞笑"
    print("=" * 60)
    print("Video Learning Agent Demo")
    print("=" * 60)
    print(f"Device: {device_id}")
    print("Platform: Douyin")
    print(f"Target videos: {target_count}")
    print(f"Watch duration: {watch_duration}s per video")
    if category:
        print(f"Category filter: {category}")
    print("=" * 60)
    # Create agent
    model_config = ModelConfig(
        base_url=base_url,
        model_name=model_name,
        api_key=api_key,
        lang="cn",
    )
    agent = VideoLearningAgent(
        model_config=model_config,
        platform="douyin",
        output_dir=output_dir,
    )
    # Setup callbacks
    def on_video_watched(record):
        print(f"\n[Video {record.sequence_id}] Watched!")
        if record.description:
            print(f"  Description: {record.description}")
        if record.likes:
            print(f"  Likes: {record.likes}")
        print(f"  Screenshot: {record.screenshot_path}")
    def on_progress_update(current, total):
        percent = (current / total * 100) if total > 0 else 0
        print(f"\nProgress: {current}/{total} ({percent:.1f}%)")
    def on_session_complete(session):
        print("\n" + "=" * 60)
        print("Session Complete!")
        print("=" * 60)
        print(f"Total videos watched: {session.total_videos}")
        print(f"Total duration: {session.total_duration:.1f}s")
        print(f"Data saved to: {output_dir}/{session.session_id}.json")
    agent.on_video_watched = on_video_watched
    agent.on_progress_update = on_progress_update
    agent.on_session_complete = on_session_complete
    # Start session
    session_id = agent.start_session(
        device_id=device_id,
        target_count=target_count,
        category=category,
        watch_duration=watch_duration,
    )
    print(f"\nSession started: {session_id}")
    print("Starting video watching task...\n")
    # Construct the task
    if category:
        task = f"""
 请帮我学习抖音上的"{category}"类视频。具体任务如下：
 1. 打开抖音应用
 2. 搜索"{category}"
 3. 开始观看视频，每个视频观看约{watch_duration}秒
 4. 记录每个视频的描述、点赞数、评论数等信息
 5. 滑动到下一个视频
 6. 重复步骤3-5，直到观看完{target_count}个视频
 请按照以下格式记录每个视频：
 - 视频序号
 - 描述文案（屏幕上的文字）
 - 点赞数（如果有显示）
 - 评论数（如果有显示）
 - 截图
 每个视频观看时，请等待{watch_duration}秒后再滑动到下一个。
        """
    else:
        task = f"""
 请帮我学习抖音上的推荐视频。具体任务如下：
 1. 打开抖音应用
 2. 在推荐页开始观看视频，每个视频观看约{watch_duration}秒
 3. 记录每个视频的描述、点赞数、评论数等信息
 4. 向上滑动到下一个视频
 5. 重复步骤3-4，直到观看完{target_count}个视频
 请按照以下格式记录每个视频：
 - 视频序号
 - 描述文案（屏幕上的文字）
 - 点赞数（如果有显示）
 - 评论数（如果有显示）
 - 截图
 每个视频观看时，请等待{watch_duration}秒后再滑动到下一个。
        """
    # Run the task
    success = agent.run_learning_task(task)
    if success:
        print("\n✓ Learning task completed successfully!")
        # Export data
        json_file = agent.export_data("json")
        print(f"✓ Data exported to: {json_file}")
        csv_file = agent.export_data("csv")
        print(f"✓ Data exported to: {csv_file}")
    else:
        print("\n✗ Learning task failed")
    print("\nSession progress:")
    progress = agent.get_session_progress()
    for key, value in progress.items():
        print(f"  {key}: {value}")
 if __name__ == "__main__":
    main()
--- a/phone_agent/init.py
+++ b/phone_agent/init.py
@@ -7,6 +7,7 @@ using AI models for visual understanding and decision making.
 from phone_agent.agent import AgentConfig, PhoneAgent, StepResult
 from phone_agent.agent_ios import IOSAgentConfig, IOSPhoneAgent
 from phone_agent.video_learning import VideoLearningAgent, VideoRecord, LearningSession
 __version__ = "0.1.0"
 __all__ = [
@@ -15,4 +16,7 @@ __all__ = [
    "AgentConfig",
    "IOSAgentConfig",
    "StepResult",
    "VideoLearningAgent",
    "VideoRecord",
    "LearningSession",
 ]
--- a/phone_agent/video_learning.py
+++ b/phone_agent/video_learning.py
@@ -0,0 +1,561 @@
 """
 Video Learning Agent for AutoGLM
 This agent learns from short video platforms (like Douyin/TikTok)
 by watching videos and collecting information.
 MVP Features:
 - Automatic video scrolling
 - Play/Pause control
 - Screenshot capture for each video
 - Basic data collection (likes, comments, etc.)
 """
 import hashlib
 import json
 import os
 from dataclasses import dataclass, field
 from datetime import datetime
 from pathlib import Path
 from typing import Callable, Dict, List, Optional, Any
 from phone_agent import PhoneAgent, AgentConfig
 from phone_agent.agent import StepResult
 from phone_agent.model.client import ModelConfig
 from phone_agent.device_factory import get_device_factory
@dataclass
class VideoRecord:
    """Data captured for a single watched video.

    Only ``sequence_id`` and ``timestamp`` are required; every other field
    has a default and can be filled in as information becomes available.
    """

    sequence_id: int
    timestamp: str
    screenshot_path: Optional[str] = None
    watch_duration: float = 0.0  # seconds

    # Basic info (extracted via OCR/analysis)
    description: Optional[str] = None  # Video caption/text
    likes: Optional[int] = None
    comments: Optional[int] = None
    shares: Optional[int] = None

    # Content analysis (for future expansion)
    tags: List[str] = field(default_factory=list)
    category: Optional[str] = None
    elements: List[str] = field(default_factory=list)

    # Metadata
    position_in_session: int = 0

    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a plain dict keyed by field name.

        Values are the attribute objects themselves (the ``tags`` and
        ``elements`` lists are not copied).
        """
        exported_fields = (
            "sequence_id",
            "timestamp",
            "screenshot_path",
            "watch_duration",
            "description",
            "likes",
            "comments",
            "shares",
            "tags",
            "category",
            "elements",
            "position_in_session",
        )
        return {name: getattr(self, name) for name in exported_fields}
@dataclass
class LearningSession:
    """State of one learning run: its settings, flags, totals and video records."""

    session_id: str
    start_time: str
    platform: str  # "douyin", "tiktok", etc.
    target_category: Optional[str] = None
    target_count: int = 10
    records: List[VideoRecord] = field(default_factory=list)

    # Control flags
    is_active: bool = True
    is_paused: bool = False

    # Statistics
    total_videos: int = 0
    total_duration: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Return the session as a plain dict, with each record converted via to_dict().

        The "records" entry is emitted last, after the scalar attributes.
        """
        payload: Dict[str, Any] = dict(
            session_id=self.session_id,
            start_time=self.start_time,
            platform=self.platform,
            target_category=self.target_category,
            target_count=self.target_count,
            is_active=self.is_active,
            is_paused=self.is_paused,
            total_videos=self.total_videos,
            total_duration=self.total_duration,
        )
        serialized_records = []
        for video in self.records:
            serialized_records.append(video.to_dict())
        payload["records"] = serialized_records
        return payload
 class VideoLearningAgent:
     """
     Agent for learning from short video platforms.

     start_session() builds a PhoneAgent with this object's callbacks attached
     and run_learning_task() drives it. After every agent step, _on_step()
     checks the foreground app, takes a screenshot and records a new
     VideoRecord whenever the screenshot differs from the previous one.

     MVP Capabilities:
     - Navigate to video platform
     - Watch videos automatically
     - Capture screenshots
     - Collect basic information
     - Export learning data

     Optional hooks (plain attributes, assign them after construction):
     - on_video_watched(record): called after each recorded video
     - on_progress_update(watched, target): called after each recorded video
     - on_session_complete(session): called once when the session ends
     """

     # Platform-specific configurations.
     # NOTE: not referenced by any method of this class.
     PLATFORM_CONFIGS = {
         "douyin": {
             "package_name": "com.ss.android.ugc.aweme",
             "activity_hint": "aweme",
             "scroll_gesture": "up",
             "like_position": {"x": 0.9, "y": 0.8},  # Relative coordinates
             "comment_position": {"x": 0.9, "y": 0.7},
         },
         "kuaishou": {
             "package_name": "com.smile.gifmaker",
             "activity_hint": "gifmaker",
             "scroll_gesture": "up",
             "like_position": {"x": 0.9, "y": 0.8},
         },
         "tiktok": {
             "package_name": "com.zhiliaoapp.musically",
             "activity_hint": "musically",
             "scroll_gesture": "up",
             "like_position": {"x": 0.9, "y": 0.8},
         },
     }

     # Substrings (matched case-insensitively) that identify each platform in
     # the value returned by get_current_app().
     _APP_KEYWORDS = {
         "douyin": ["aweme", "抖音", "douyin"],
         "kuaishou": ["gifmaker", "快手", "kuaishou"],
         "tiktok": ["musically", "tiktok"],
     }

     def __init__(
         self,
         model_config: "ModelConfig",
         platform: str = "douyin",
         output_dir: str = "./video_learning_data",
     ):
         """
         Initialize Video Learning Agent.

         Creates output_dir and its "screenshots" subdirectory if needed.

         Args:
             model_config: Model configuration for VLM
             platform: Platform name (douyin, kuaishou, tiktok). Unknown names
                 are accepted; app detection then falls back to the douyin
                 keyword "aweme".
             output_dir: Directory to save screenshots and data
         """
         self.model_config = model_config
         self.platform = platform
         self.output_dir = Path(output_dir)
         self.output_dir.mkdir(parents=True, exist_ok=True)

         # Create screenshots subdirectory
         self.screenshot_dir = self.output_dir / "screenshots"
         self.screenshot_dir.mkdir(exist_ok=True)

         # Current session
         self.current_session: Optional[LearningSession] = None
         self.video_counter = 0

         # Agent will be created when starting a session
         self.agent: Optional[PhoneAgent] = None

         # Callbacks for external control
         self.on_video_watched: Optional[Callable[[VideoRecord], None]] = None
         self.on_session_complete: Optional[Callable[[LearningSession], None]] = None
         self.on_progress_update: Optional[Callable[[int, int], None]] = None

         # Video detection: track screenshot changes (simplified)
         self._last_screenshot_hash: Optional[str] = None

         # Per-session parameters; start_session() overwrites them. They are
         # initialised here so the attributes always exist.
         self._watch_duration: float = 0.0
         self._device_id: Optional[str] = None

         # Guards on_session_complete so it fires at most once per session.
         self._completion_notified = False

     def start_session(
         self,
         device_id: str,
         target_count: int = 10,
         category: Optional[str] = None,
         watch_duration: float = 3.0,
         max_steps: int = 500,
     ) -> str:
         """
         Start a learning session.

         Replaces any existing session and agent, and resets the video counter
         and screenshot-change tracking.

         Args:
             device_id: Target device ID
             target_count: Number of videos to watch
             category: Target category (e.g., "美食", "旅行")
             watch_duration: Duration stored on every record, in seconds. It is
                 a configured value, not a measurement.
             max_steps: Maximum execution steps

         Returns:
             Session ID
         """
         # Session IDs have one-second resolution.
         session_id = f"session_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
         self.current_session = LearningSession(
             session_id=session_id,
             start_time=datetime.now().isoformat(),
             platform=self.platform,
             target_category=category,
             target_count=target_count,
         )

         # Configure agent with callbacks
         agent_config = AgentConfig(
             device_id=device_id,
             max_steps=max_steps,
             lang="cn",
             step_callback=self._on_step,
             before_action_callback=self._before_action,
         )

         # Create phone agent
         self.agent = PhoneAgent(
             model_config=self.model_config,
             agent_config=agent_config,
         )

         # Store parameters for the task
         self._watch_duration = watch_duration
         self._device_id = device_id

         # Reset per-session tracking
         self._last_screenshot_hash = None
         self.video_counter = 0
         self._completion_notified = False

         return session_id

     def run_learning_task(self, task: str) -> bool:
         """
         Run the learning task.

         The session is marked inactive and saved when the task ends, whether
         it finished normally or raised, and on_session_complete is fired if
         it has not been fired already.

         Args:
             task: Natural language task description

         Returns:
             Truthiness of the agent's result, or False if an error occurred

         Raises:
             RuntimeError: If start_session() has not been called.
         """
         if not self.agent or not self.current_session:
             raise RuntimeError("Session not started. Call start_session() first.")

         try:
             result = self.agent.run(task)
             self._finish_session()
             print(f"[VideoLearning] Session completed. Recorded {self.video_counter} videos.")
             return bool(result)
         except Exception as e:
             print(f"Error during learning: {e}")
             # Best effort: keep whatever was recorded before the failure.
             try:
                 self._finish_session()
             except Exception as finalize_error:
                 print(f"[VideoLearning] Warning: could not finalize session: {finalize_error}")
             return False

     def stop_session(self):
         """Request a stop; it takes effect at the next step callback."""
         if self.current_session:
             self.current_session.is_active = False

     def pause_session(self):
         """
         Pause the current session (can be resumed).

         While paused, the step callback returns None without recording, so
         this suspends recording only. It does not block the agent itself.
         """
         if self.current_session:
             self.current_session.is_paused = True

     def resume_session(self):
         """Resume a paused session."""
         if self.current_session:
             self.current_session.is_paused = False

     def _finish_session(self) -> None:
         """Mark the session inactive, save it, and fire on_session_complete once."""
         session = self.current_session
         if session is None:
             return
         session.is_active = False
         self._save_session()
         if self._completion_notified:
             return
         # Set the flag first so a raising callback is not invoked again.
         self._completion_notified = True
         if self.on_session_complete:
             self.on_session_complete(session)

     def _on_step(self, result: "StepResult") -> Optional[str]:
         """
         Callback after each step.

         Simplified logic:
         1. Check if we're in the target app using get_current_app()
         2. Detect screenshot changes
         3. Record video when screenshot changes

         Args:
             result: Step execution result (only step_count is read, for logging)

         Returns:
             "stop" to end the session, None to continue
         """
         session = self.current_session
         if session is None:
             return None

         # Stop requested (stop_session) or session already finished
         if not session.is_active:
             self._finish_session()
             return "stop"

         if session.is_paused:
             return None

         # Check if we've watched enough videos
         if self.video_counter >= session.target_count:
             self._finish_session()
             return "stop"

         try:
             factory = get_device_factory()

             # A missing app name is treated as "not in target app".
             current_app = factory.get_current_app(self._device_id) or ""
             keywords = self._APP_KEYWORDS.get(self.platform, ["aweme"])
             app_name = current_app.lower()
             if not any(keyword.lower() in app_name for keyword in keywords):
                 print(f"[VideoLearning] Not in target app: {current_app} (step {result.step_count})")
                 return None

             screenshot = factory.get_screenshot(self._device_id)

             # MD5 over the full base64 payload, used purely as a change
             # fingerprint. Any difference in the screenshot counts as a new
             # video. NOTE(review): frames of a playing video presumably
             # differ between steps too; confirm this is intended.
             current_hash = hashlib.md5(screenshot.base64_data.encode()).hexdigest()
             if current_hash == self._last_screenshot_hash:
                 return None

             # First screenshot in the target app, or the screenshot changed
             self._last_screenshot_hash = current_hash
             self._record_video_from_screenshot(screenshot)
             print(f"[VideoLearning] ✓ Recorded video {self.video_counter}/{session.target_count}")

             # Stop immediately once the target is reached
             if self.video_counter >= session.target_count:
                 print("[VideoLearning] ✓ Target reached! Stopping...")
                 self._finish_session()
                 return "stop"
         except Exception as e:
             # Detection is best effort; a failed step must not abort the agent.
             print(f"[VideoLearning] Warning: {e}")

         return None

     def _record_video_from_screenshot(self, screenshot):
         """Decode the screenshot's base64 payload and record it as the next video."""
         import base64

         screenshot_bytes = base64.b64decode(screenshot.base64_data)
         self.record_video(
             screenshot=screenshot_bytes,
             description=f"Video #{self.video_counter + 1}",
         )

     def _before_action(self, action: Dict[str, Any]) -> Optional[Dict[str, Any]]:
         """
         Callback before executing an action.

         Args:
             action: Action to execute

         Returns:
             Modified action or None (currently always None)
         """
         # Could be used for action logging or modification
         return None

     def record_video(
         self,
         screenshot: Optional[bytes] = None,
         description: Optional[str] = None,
         likes: Optional[int] = None,
         comments: Optional[int] = None,
         shares: Optional[int] = None,
     ) -> "VideoRecord":
         """
         Record a watched video.

         The video counter only advances after the screenshot has been
         written, so a failed write leaves counter and records consistent.

         Args:
             screenshot: Screenshot image data
             description: Video description/caption
             likes: Number of likes
             comments: Number of comments
             shares: Number of shares

         Returns:
             VideoRecord object

         Raises:
             RuntimeError: If no session has been started.
         """
         session = self.current_session
         if session is None:
             raise RuntimeError("Session not started. Call start_session() first.")

         sequence_id = self.video_counter + 1

         # Save screenshot if provided
         screenshot_path = None
         if screenshot:
             screenshot_filename = f"{session.session_id}_video_{sequence_id}.png"
             (self.screenshot_dir / screenshot_filename).write_bytes(screenshot)
             # Store relative path for web access: /video-learning-data/screenshots/filename.png
             screenshot_path = f"/video-learning-data/screenshots/{screenshot_filename}"

         record = VideoRecord(
             sequence_id=sequence_id,
             timestamp=datetime.now().isoformat(),
             screenshot_path=screenshot_path,
             watch_duration=self._watch_duration,
             description=description,
             likes=likes,
             comments=comments,
             shares=shares,
             position_in_session=sequence_id,
         )

         # Commit to the session
         self.video_counter = sequence_id
         session.records.append(record)
         session.total_videos = sequence_id
         session.total_duration += self._watch_duration

         # Notify callback
         if self.on_video_watched:
             self.on_video_watched(record)

         # Notify progress
         if self.on_progress_update:
             self.on_progress_update(sequence_id, session.target_count)

         return record

     def _save_session(self):
         """Save session data to <output_dir>/<session_id>.json."""
         if not self.current_session:
             return

         session_file = self.output_dir / f"{self.current_session.session_id}.json"
         with open(session_file, "w", encoding="utf-8") as f:
             json.dump(self.current_session.to_dict(), f, ensure_ascii=False, indent=2)

         print(f"Session saved to {session_file}")

     def export_data(self, format: str = "json") -> str:
         """
         Export session data.

         Args:
             format: Export format (json, csv)

         Returns:
             Path to exported file

         Raises:
             RuntimeError: If there is no session to export.
             ValueError: If format is neither "json" nor "csv".
         """
         if not self.current_session:
             raise RuntimeError("No session to export")

         if format == "json":
             return self._export_json()
         if format == "csv":
             return self._export_csv()
         raise ValueError(f"Unsupported format: {format}")

     def _export_json(self) -> str:
         """Export the whole session as <session_id>_export.json."""
         output_file = self.output_dir / f"{self.current_session.session_id}_export.json"
         with open(output_file, "w", encoding="utf-8") as f:
             json.dump(self.current_session.to_dict(), f, ensure_ascii=False, indent=2)
         return str(output_file)

     def _export_csv(self) -> str:
         """
         Export the records as <session_id>_export.csv, one row per video.

         With no records the file is created empty (no header row).
         """
         import csv

         records = self.current_session.records
         output_file = self.output_dir / f"{self.current_session.session_id}_export.csv"
         with open(output_file, "w", encoding="utf-8", newline="") as f:
             if records:
                 writer = csv.DictWriter(f, fieldnames=records[0].to_dict().keys())
                 writer.writeheader()
                 for record in records:
                     writer.writerow(record.to_dict())
         return str(output_file)

     def get_session_progress(self) -> Dict[str, Any]:
         """Get current session progress, or {"status": "no_session"} if none exists."""
         session = self.current_session
         if not session:
             return {"status": "no_session"}

         if session.target_count > 0:
             progress_percent = self.video_counter / session.target_count * 100
         else:
             progress_percent = 0

         return {
             "session_id": session.session_id,
             "platform": session.platform,
             "target_count": session.target_count,
             "watched_count": self.video_counter,
             "progress_percent": progress_percent,
             "is_active": session.is_active,
             "is_paused": session.is_paused,
             "total_duration": session.total_duration,
         }
 # Convenience factory for standalone usage
 def create_video_learning_agent(
     base_url: str,
     api_key: str,
     model_name: str = "autoglm-phone-9b",
     platform: str = "douyin",
     output_dir: str = "./video_learning_data",
     **model_kwargs,
 ) -> VideoLearningAgent:
     """
     Build a ModelConfig from the given settings and wrap it in a VideoLearningAgent.

     Args:
         base_url: Model API base URL
         api_key: API key
         model_name: Model name
         platform: Platform name
         output_dir: Output directory
         **model_kwargs: Extra keyword arguments forwarded to ModelConfig

     Returns:
         VideoLearningAgent instance
     """
     vlm_settings = {
         "base_url": base_url,
         "model_name": model_name,
         "api_key": api_key,
     }
     # The named settings can never appear in model_kwargs, so nothing is overridden.
     vlm_settings.update(model_kwargs)
     vlm_config = ModelConfig(**vlm_settings)
     agent = VideoLearningAgent(vlm_config, platform, output_dir)
     return agent
--- a/scripts/run_video_learning_demo.bat
+++ b/scripts/run_video_learning_demo.bat
@@ -0,0 +1,35 @@
@echo off
REM Video Learning Demo Script for Windows
REM Starts the dashboard in its own console window and opens the
REM video learning page in the default browser.
echo ============================================
echo AutoGLM Video Learning Demo
echo ============================================
echo.
echo Starting Dashboard...
echo.
REM "start" launches uvicorn in a separate window titled "AutoGLM Dashboard",
REM so the server keeps running independently of this script.
start "AutoGLM Dashboard" python -m uvicorn dashboard.main:app --host 0.0.0.0 --port 8080 --reload
echo Waiting for dashboard to start...
timeout /t 3 /nobreak > nul
echo.
echo Dashboard starting at: http://localhost:8080
echo Opening Video Learning page in browser...
echo.
REM Open the video learning page
start http://localhost:8080/static/video-learning.html
echo.
echo ============================================
echo Video Learning Demo is ready!
echo ============================================
echo.
REM Ctrl+C in this window only interrupts the pause below; the server lives
REM in the other window, so point the user there.
echo The dashboard runs in the separate "AutoGLM Dashboard" window.
echo Close that window, or press Ctrl+C in it, to stop the dashboard.
echo.
REM Keep this window open until a key is pressed
pause
--- a/scripts/run_video_learning_demo.sh
+++ b/scripts/run_video_learning_demo.sh
@@ -0,0 +1,43 @@
 #!/bin/bash
 # Video Learning Demo Script for Linux/Mac
 # Starts the dashboard in the background, opens the video learning page,
 # then waits until the dashboard exits.

 PAGE_URL="http://localhost:8080/static/video-learning.html"

 echo "============================================"
 echo "AutoGLM Video Learning Demo"
 echo "============================================"
 echo ""
 echo "Starting Dashboard..."
 echo ""

 # Start the dashboard in background
 python -m uvicorn dashboard.main:app --host 0.0.0.0 --port 8080 --reload &
 DASHBOARD_PID=$!

 # Stop the dashboard whenever this script exits, so no server is left
 # running in the background.
 cleanup() {
     kill "$DASHBOARD_PID" 2>/dev/null
     wait "$DASHBOARD_PID" 2>/dev/null
 }
 trap cleanup EXIT
 trap 'exit 130' INT TERM

 echo "Waiting for dashboard to start..."
 sleep 3

 # Do not claim success if the server already died (e.g. port 8080 in use).
 if ! kill -0 "$DASHBOARD_PID" 2>/dev/null; then
     echo "Dashboard failed to start. See the output above for details." >&2
     exit 1
 fi

 echo ""
 echo "Dashboard starting at: http://localhost:8080"
 echo "Opening Video Learning page in browser..."
 echo ""

 # Open the video learning page
 if command -v xdg-open > /dev/null; then
     xdg-open "$PAGE_URL"
 elif command -v open > /dev/null; then
     open "$PAGE_URL"
 else
     echo "Please open your browser and navigate to:"
     echo "$PAGE_URL"
 fi

 echo ""
 echo "============================================"
 echo "Video Learning Demo is ready!"
 echo "============================================"
 echo ""
 echo "Press Ctrl+C to stop the dashboard"
 echo ""

 # Wait for dashboard process
 wait "$DASHBOARD_PID"