Add Video Learning Agent for short video platforms

Features:
- VideoLearningAgent for automated video watching on Douyin/Kuaishou/TikTok
- Web dashboard UI for video learning sessions
- Real-time progress tracking with screenshot capture
- App detection using get_current_app() for accurate recording
- Session management with pause/resume/stop controls

Technical improvements:
- Simplified video detection logic using direct app detection
- Full base64 hash for sensitive screenshot change detection
- Immediate stop when target video count is reached
- Fixed circular import issues with ModelConfig

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
let5sne.win10
2026-01-09 22:54:57 +08:00
parent 3552df23d6
commit 5b3f214e20
15 changed files with 2317 additions and 1 deletions

View File

@@ -108,3 +108,16 @@ SCREENSHOT_THROTTLE_MS=500
# Maximum task history to keep / 保留的最大任务历史数
MAX_TASK_HISTORY=100
# ============================================================================
# Video Learning Configuration / 视频学习配置
# ============================================================================
# Output directory for video learning data / 视频学习数据输出目录
VIDEO_LEARNING_OUTPUT_DIR=./video_learning_data
# Model parameters for video learning / 视频学习模型参数
PHONE_AGENT_MAX_TOKENS=3000
PHONE_AGENT_TEMPERATURE=0.0
PHONE_AGENT_TOP_P=0.85
PHONE_AGENT_FREQUENCY_PENALTY=0.2

View File

@@ -5,9 +5,11 @@ API endpoints for the dashboard.
from dashboard.api.devices import router as devices_router
from dashboard.api.tasks import router as tasks_router
from dashboard.api.websocket import router as websocket_router
from dashboard.api.video_learning import router as video_learning_router
__all__ = [
"devices_router",
"tasks_router",
"websocket_router",
"video_learning_router",
]

View File

@@ -0,0 +1,328 @@
"""
Video Learning API endpoints for the dashboard.
"""
import asyncio
from datetime import datetime
from typing import Dict, List, Optional
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel, Field
from dashboard.config import config
from dashboard.dependencies import get_device_manager
from dashboard.services.device_manager import DeviceManager
from phone_agent import VideoLearningAgent
from phone_agent.model.client import ModelConfig
router = APIRouter(prefix="/api/video-learning", tags=["video-learning"])
class SessionCreateRequest(BaseModel):
    """Request body for creating a new video learning session.

    Range validation (target_count, watch_duration) is enforced by the
    pydantic Field constraints below; the endpoint performs the device
    availability checks itself.
    """

    # Device must already be registered with the dashboard DeviceManager.
    device_id: str = Field(..., description="Target device ID")
    # Which short-video app the agent should drive.
    platform: str = Field("douyin", description="Platform name (douyin, kuaishou, tiktok)")
    target_count: int = Field(10, description="Number of videos to watch", ge=1, le=100)
    # When set, the agent searches this category instead of the recommended feed.
    category: Optional[str] = Field(None, description="Target category filter")
    watch_duration: float = Field(3.0, description="Watch duration per video (seconds)", ge=1.0, le=30.0)
class SessionControlRequest(BaseModel):
    """Request body for controlling an existing session.

    ``action`` must be one of ``pause``, ``resume`` or ``stop``; the
    control endpoint rejects anything else with HTTP 400.
    """

    action: str = Field(..., description="Action: pause, resume, stop")
class SessionStatus(BaseModel):
    """Snapshot of a learning session, returned by the status endpoint.

    Progress fields mirror the dict produced by
    ``VideoLearningAgent.get_session_progress()``.
    """

    session_id: str
    platform: str
    target_count: int
    watched_count: int
    progress_percent: float
    is_active: bool
    is_paused: bool
    total_duration: float
    # Summary of the most recently watched video (sequence_id, timestamp,
    # screenshot_path, description, likes, comments); None until the first
    # video has been recorded.
    current_video: Optional[Dict] = None
class VideoInfo(BaseModel):
    """Information about a single watched video, as exposed by the API."""

    sequence_id: int
    # ISO-style timestamp string produced by the agent when recording.
    timestamp: str
    screenshot_path: Optional[str] = None
    watch_duration: float
    description: Optional[str] = None
    likes: Optional[int] = None
    comments: Optional[int] = None
    # Use default_factory rather than a mutable literal: declaring a bare
    # list as the class-level default is the classic shared-mutable-default
    # trap, and default_factory is pydantic's documented way to request a
    # fresh empty list per instance.
    tags: List[str] = Field(default_factory=list)
    category: Optional[str] = None
# Global, in-process registry of live sessions keyed by session_id.
# NOTE: this is per-worker state; in production (or with multiple uvicorn
# workers) it should be replaced by a shared store such as a database.
_active_sessions: Dict[str, VideoLearningAgent] = {}
@router.post("/sessions", response_model=Dict[str, str])
async def create_session(
    request: SessionCreateRequest,
    device_manager: DeviceManager = Depends(get_device_manager),
) -> Dict[str, str]:
    """Create a new video learning session.

    Validates the target device, builds a VideoLearningAgent from the
    dashboard model settings, wires progress callbacks, and registers the
    session in ``_active_sessions``. The session is created but NOT yet
    running — call ``POST /sessions/{session_id}/start`` to execute it.

    Raises:
        HTTPException: 404 unknown device, 400 disconnected, 409 busy.

    Returns:
        ``{"session_id": ..., "status": "created"}``.
    """
    # Check device availability
    device = await device_manager.get_device(request.device_id)
    if not device:
        raise HTTPException(status_code=404, detail="Device not found")
    if not device.is_connected:
        raise HTTPException(status_code=400, detail="Device not connected")
    if device.status == "busy":
        raise HTTPException(status_code=409, detail="Device is busy")
    # Create model config from environment
    model_config = ModelConfig(
        base_url=config.MODEL_BASE_URL,
        model_name=config.MODEL_NAME,
        api_key=config.MODEL_API_KEY,
        max_tokens=config.MAX_TOKENS,
        temperature=config.TEMPERATURE,
        top_p=config.TOP_P,
        frequency_penalty=config.FREQUENCY_PENALTY,
        lang="cn",
    )
    # Create video learning agent
    agent = VideoLearningAgent(
        model_config=model_config,
        platform=request.platform,
        output_dir=config.VIDEO_LEARNING_OUTPUT_DIR,
    )
    # Setup callbacks for real-time updates.
    # session_id is assigned by start_session() further down; the closures
    # read the variable at call time, so by the time any callback fires they
    # observe the real ID, not None.
    session_id = None
    def on_video_watched(record):
        """Callback when a video is watched."""
        # Broadcast via WebSocket
        if session_id:
            # This would be integrated with WebSocket manager
            pass
    def on_progress_update(current, total):
        """Callback for progress updates."""
        if session_id:
            # Broadcast progress
            pass
    def on_session_complete(session):
        """Callback when session completes."""
        # Drop the finished session so it is no longer listed as active.
        if session_id and session_id in _active_sessions:
            del _active_sessions[session_id]
    agent.on_video_watched = on_video_watched
    agent.on_progress_update = on_progress_update
    agent.on_session_complete = on_session_complete
    # Start session
    session_id = agent.start_session(
        device_id=request.device_id,
        target_count=request.target_count,
        category=request.category,
        watch_duration=request.watch_duration,
        max_steps=500,
    )
    # Store session
    _active_sessions[session_id] = agent
    return {"session_id": session_id, "status": "created"}
@router.post("/sessions/{session_id}/start", response_model=Dict[str, str])
async def start_session(session_id: str) -> Dict[str, str]:
    """Start executing a learning session.

    Builds a natural-language task prompt from the session parameters and
    runs the agent in a worker thread so the event loop is not blocked.

    Raises:
        HTTPException: 404 if the session is unknown, 400 if the agent has
            no initialized session.

    Returns:
        ``{"session_id": ..., "status": "started"}`` immediately; the
        learning task keeps running in the background.
    """
    if session_id not in _active_sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    agent = _active_sessions[session_id]
    # Build task based on session parameters
    session = agent.current_session
    if not session:
        raise HTTPException(status_code=400, detail="Session not initialized")
    category = session.target_category
    target_count = session.target_count
    watch_duration = agent._watch_duration
    platform = agent.platform
    # Platform-specific app name and package
    platform_info = {
        "douyin": {
            "name": "抖音",
            "package": "com.ss.android.ugc.aweme",
        },
        "kuaishou": {
            "name": "快手",
            "package": "com.smile.gifmaker",
        },
        "tiktok": {
            "name": "TikTok",
            "package": "com.zhiliaoapp.musically",
        },
    }
    # Unknown platform names fall back to douyin.
    info = platform_info.get(platform, platform_info["douyin"])
    app_name = info["name"]
    # Build clear task instructions
    if category:
        task = f"""你是一个视频学习助手。请严格按照以下步骤执行:
步骤1启动应用
- 回到主屏幕
- 打开{app_name}应用
步骤2搜索内容
- 在{app_name}中搜索"{category}"
- 点击第一个搜索结果或进入相关页面
步骤3观看视频
- 观看视频,每个视频停留约{watch_duration}
- 记录视频的描述、点赞数、评论数
- 向上滑动切换到下一个视频
- 重复观看和记录,直到完成{target_count}个视频
步骤4完成任务
- 完成观看{target_count}个视频后,总结所有视频信息
请现在开始执行。"""
    else:
        task = f"""你是一个视频学习助手。请严格按照以下步骤执行:
步骤1启动应用
- 回到主屏幕
- 打开{app_name}应用
步骤2观看推荐视频
- 进入{app_name}的推荐页面
- 观看推荐视频,每个视频停留约{watch_duration}
- 记录视频的描述、点赞数、评论数
- 向上滑动切换到下一个视频
- 重复观看和记录,直到完成{target_count}个视频
步骤3完成任务
- 完成观看{target_count}个视频后,总结所有视频信息
请现在开始执行。"""
    # Run in background. Keep a strong reference to the task on the agent:
    # the event loop only holds weak references to tasks, so a task whose
    # result is discarded can be garbage-collected before it finishes
    # (see the asyncio.create_task documentation).
    agent._background_task = asyncio.create_task(
        asyncio.to_thread(agent.run_learning_task, task)
    )
    return {"session_id": session_id, "status": "started"}
@router.post("/sessions/{session_id}/control", response_model=Dict[str, str])
async def control_session(
    session_id: str, request: SessionControlRequest
) -> Dict[str, str]:
    """Apply a pause/resume/stop action to a running session."""
    agent = _active_sessions.get(session_id)
    if agent is None:
        raise HTTPException(status_code=404, detail="Session not found")
    action = request.action
    if action == "pause":
        agent.pause_session()
        new_status = "paused"
    elif action == "resume":
        agent.resume_session()
        new_status = "resumed"
    elif action == "stop":
        agent.stop_session()
        # A stopped session is no longer tracked as active.
        del _active_sessions[session_id]
        new_status = "stopped"
    else:
        raise HTTPException(status_code=400, detail=f"Invalid action: {request.action}")
    return {"session_id": session_id, "status": new_status}
@router.get("/sessions/{session_id}/status", response_model=SessionStatus)
async def get_session_status(session_id: str) -> SessionStatus:
    """Return a progress snapshot for one active session."""
    agent = _active_sessions.get(session_id)
    if agent is None:
        raise HTTPException(status_code=404, detail="Session not found")
    progress = agent.get_session_progress()
    # Summarize the most recently recorded video, if any exists yet.
    current_video = None
    session = agent.current_session
    if session and session.records:
        latest = session.records[-1]
        current_video = {
            field: getattr(latest, field)
            for field in (
                "sequence_id",
                "timestamp",
                "screenshot_path",
                "description",
                "likes",
                "comments",
            )
        }
    return SessionStatus(
        current_video=current_video,
        **{
            key: progress[key]
            for key in (
                "session_id",
                "platform",
                "target_count",
                "watched_count",
                "progress_percent",
                "is_active",
                "is_paused",
                "total_duration",
            )
        },
    )
@router.get("/sessions/{session_id}/videos", response_model=List[VideoInfo])
async def get_session_videos(session_id: str) -> List[VideoInfo]:
    """Return every video watched so far in the given session."""
    if session_id not in _active_sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    session = _active_sessions[session_id].current_session
    if not session:
        return []

    def _as_info(record) -> VideoInfo:
        # Map one raw watch record onto the public response model.
        return VideoInfo(
            sequence_id=record.sequence_id,
            timestamp=record.timestamp,
            screenshot_path=record.screenshot_path,
            watch_duration=record.watch_duration,
            description=record.description,
            likes=record.likes,
            comments=record.comments,
            tags=record.tags,
            category=record.category,
        )

    return [_as_info(record) for record in session.records]
@router.get("/sessions", response_model=List[str])
async def list_sessions() -> List[str]:
    """Return the IDs of every session currently held in memory."""
    # Iterating the dict yields its keys; unpack into a fresh list.
    return [*_active_sessions]
@router.delete("/sessions/{session_id}", response_model=Dict[str, str])
async def delete_session(session_id: str) -> Dict[str, str]:
    """Remove a session from the in-memory registry.

    Raises:
        HTTPException: 404 when no such session exists.
    """
    # Registered agents are never None, so the sentinel means "absent".
    agent = _active_sessions.pop(session_id, None)
    if agent is None:
        raise HTTPException(status_code=404, detail="Session not found")
    return {"session_id": session_id, "status": "deleted"}

View File

@@ -39,6 +39,13 @@ class DashboardConfig:
MODEL_BASE_URL: str = os.getenv("PHONE_AGENT_BASE_URL", "http://localhost:8000/v1")
MODEL_NAME: str = os.getenv("PHONE_AGENT_MODEL", "autoglm-phone-9b")
MODEL_API_KEY: str = os.getenv("PHONE_AGENT_API_KEY", "EMPTY")
MAX_TOKENS: int = int(os.getenv("PHONE_AGENT_MAX_TOKENS", "3000"))
TEMPERATURE: float = float(os.getenv("PHONE_AGENT_TEMPERATURE", "0.0"))
TOP_P: float = float(os.getenv("PHONE_AGENT_TOP_P", "0.85"))
FREQUENCY_PENALTY: float = float(os.getenv("PHONE_AGENT_FREQUENCY_PENALTY", "0.2"))
# Video learning settings
VIDEO_LEARNING_OUTPUT_DIR: str = os.getenv("VIDEO_LEARNING_OUTPUT_DIR", "./video_learning_data")
# Task history
MAX_TASK_HISTORY: int = int(os.getenv("MAX_TASK_HISTORY", "100"))

View File

@@ -16,7 +16,7 @@ from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from dashboard.api import devices_router, tasks_router, websocket_router
from dashboard.api import devices_router, tasks_router, websocket_router, video_learning_router
from dashboard.config import config
from dashboard.dependencies import (
get_device_manager,
@@ -104,6 +104,7 @@ async def global_exception_handler(request: Request, exc: Exception):
app.include_router(devices_router, prefix="/api")
app.include_router(tasks_router, prefix="/api")
app.include_router(websocket_router)
app.include_router(video_learning_router)
# Health check
@@ -163,6 +164,12 @@ if static_path.exists():
app.mount("/static", StaticFiles(directory=str(static_path)), name="static")
# Mount static files for video learning screenshots
video_learning_data_path = Path(config.VIDEO_LEARNING_OUTPUT_DIR)
if video_learning_data_path.exists():
app.mount("/video-learning-data", StaticFiles(directory=str(video_learning_data_path)), name="video-learning-data")
# Run script entry point
if __name__ == "__main__":
import uvicorn

View File

@@ -0,0 +1,283 @@
/* Video Learning Module Styles
 *
 * Styles for /static/video-learning.html. Relies on the CSS custom
 * properties (--card-bg, --border-color, --primary-color, ...) declared
 * by the main dashboard stylesheet, which must be loaded first.
 */
/* Header modifications */
.header h1 {
    display: flex;
    align-items: center;
    gap: 0.75rem;
}
/* Configuration Section */
.config-section {
    background-color: var(--card-bg);
    border: 1px solid var(--border-color);
    border-radius: 12px;
    padding: 2rem;
    /* Keep the form readable on wide screens and centered. */
    max-width: 800px;
    margin: 0 auto;
}
.config-form {
    display: flex;
    flex-direction: column;
    gap: 1.5rem;
}
.form-group {
    display: flex;
    flex-direction: column;
    gap: 0.5rem;
}
.form-group label {
    font-size: 0.875rem;
    font-weight: 500;
    color: var(--text-primary);
}
.form-group select,
.form-group input {
    padding: 0.75rem 1rem;
    background-color: var(--bg-color);
    border: 1px solid var(--border-color);
    border-radius: 8px;
    color: var(--text-primary);
    font-size: 0.95rem;
}
.form-group select:focus,
.form-group input:focus {
    outline: none;
    border-color: var(--primary-color);
}
.form-group select:disabled,
.form-group input:disabled {
    opacity: 0.5;
    cursor: not-allowed;
}
.form-group small {
    font-size: 0.75rem;
    color: var(--text-secondary);
}
/* Two equal columns for paired numeric inputs (count / duration). */
.form-row {
    display: grid;
    grid-template-columns: 1fr 1fr;
    gap: 1rem;
}
/* Session Section */
.session-section {
    background-color: var(--card-bg);
    border: 1px solid var(--border-color);
    border-radius: 12px;
    padding: 2rem;
}
.session-header {
    display: flex;
    justify-content: space-between;
    align-items: center;
    margin-bottom: 1.5rem;
}
.session-header h2 {
    font-size: 1.25rem;
    font-weight: 600;
    color: var(--text-primary);
}
.session-controls {
    display: flex;
    gap: 0.5rem;
}
/* Progress Section */
.progress-section {
    background-color: var(--bg-color);
    border-radius: 8px;
    padding: 1.5rem;
    margin-bottom: 1.5rem;
}
.progress-info {
    display: flex;
    justify-content: space-between;
    margin-bottom: 0.5rem;
    font-size: 0.875rem;
    color: var(--text-secondary);
}
.progress-bar-large {
    height: 8px;
    background-color: rgba(99, 102, 241, 0.2);
    border-radius: 4px;
    overflow: hidden;
}
.progress-fill {
    height: 100%;
    background-color: var(--primary-color);
    /* Animate width changes so 1s polling updates look smooth. */
    transition: width 0.3s ease;
}
.progress-stats {
    margin-top: 0.5rem;
    font-size: 0.8rem;
    color: var(--text-secondary);
}
/* Current Video */
.current-video {
    margin-bottom: 2rem;
}
.current-video h3 {
    font-size: 1rem;
    font-weight: 600;
    color: var(--text-primary);
    margin-bottom: 1rem;
}
/* Video Cards */
.video-card {
    background-color: var(--bg-color);
    border: 1px solid var(--border-color);
    border-radius: 8px;
    overflow: hidden;
    transition: border-color 0.2s;
}
.video-card:hover {
    border-color: var(--primary-color);
}
/* 9/16 matches the portrait orientation of short-video screenshots. */
.video-screenshot {
    width: 100%;
    aspect-ratio: 9/16;
    background-color: #000;
    overflow: hidden;
}
.video-screenshot img {
    width: 100%;
    height: 100%;
    object-fit: contain;
}
.video-placeholder {
    width: 100%;
    aspect-ratio: 9/16;
    background-color: var(--bg-color);
    display: flex;
    align-items: center;
    justify-content: center;
    color: var(--text-secondary);
}
.video-info {
    padding: 1rem;
}
.video-id {
    font-size: 0.75rem;
    font-weight: 600;
    color: var(--primary-color);
    margin-bottom: 0.5rem;
}
.video-description {
    font-size: 0.875rem;
    color: var(--text-primary);
    margin-bottom: 0.5rem;
    line-height: 1.4;
}
.video-stats {
    display: flex;
    gap: 1rem;
    font-size: 0.75rem;
    color: var(--text-secondary);
}
.video-stats span {
    display: flex;
    align-items: center;
    gap: 0.25rem;
}
.video-stats svg {
    flex-shrink: 0;
}
/* Session Complete */
.session-complete {
    text-align: center;
    padding: 3rem 2rem;
}
.complete-icon {
    display: flex;
    justify-content: center;
    margin-bottom: 1rem;
    color: var(--success-color);
}
.session-complete h3 {
    font-size: 1.5rem;
    font-weight: 600;
    color: var(--text-primary);
    margin-bottom: 0.5rem;
}
.session-complete p {
    color: var(--text-secondary);
    margin-bottom: 1.5rem;
}
/* Video Grid */
.video-grid {
    display: grid;
    /* Responsive columns: as many 200px-min cards as fit the row. */
    grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
    gap: 1rem;
}
.video-grid .video-card {
    font-size: 0.875rem;
}
.video-grid .video-screenshot,
.video-grid .video-placeholder {
    aspect-ratio: 9/16;
}
/* History Section */
.history-section {
    margin-top: 2rem;
}
.history-section h2 {
    font-size: 1.25rem;
    font-weight: 600;
    color: var(--text-primary);
    margin-bottom: 1rem;
}
/* Responsive */
@media (max-width: 768px) {
    .form-row {
        grid-template-columns: 1fr;
    }
    .session-header {
        flex-direction: column;
        gap: 1rem;
        align-items: flex-start;
    }
    .video-grid {
        grid-template-columns: repeat(auto-fill, minmax(150px, 1fr));
    }
}

View File

@@ -41,6 +41,13 @@
</div>
</div>
<div class="header-actions">
<a href="/static/video-learning.html" class="btn btn-primary">
<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<polygon points="23 7 16 12 23 17 23 7"></polygon>
<rect x="1" y="5" width="15" height="14" rx="2" ry="2"></rect>
</svg>
Video Learning
</a>
<button @click="refreshDevices" class="btn btn-secondary" :disabled="refreshing">
<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" :class="{ spinning: refreshing }">
<polyline points="23 4 23 10 17 10"></polyline>

View File

@@ -0,0 +1,200 @@
/**
 * Video Learning Module for AutoGLM Dashboard
 *
 * This module provides UI and functionality for the Video Learning Agent,
 * allowing users to watch and learn from short video platforms.
 *
 * All network calls go through axios against the /api/video-learning REST
 * endpoints. While polling is active, consumers receive live session state
 * via the `videoLearningUpdate` window event.
 */
const VideoLearningModule = {
    // Current session state
    currentSessionId: null,
    currentSessionStatus: null,
    videos: [],
    isPolling: false,

    /**
     * Create a new learning session and begin polling its status.
     * @param {string} deviceId - Target device ID.
     * @param {Object} [options] - platform / targetCount / category / watchDuration.
     * @returns {Promise<Object>} Response data containing `session_id`.
     */
    async createSession(deviceId, options = {}) {
        const {
            platform = 'douyin',
            targetCount = 10,
            category = null,
            watchDuration = 3.0,
        } = options;
        try {
            const response = await axios.post('/api/video-learning/sessions', {
                device_id: deviceId,
                platform: platform,
                target_count: targetCount,
                category: category,
                watch_duration: watchDuration,
            });
            this.currentSessionId = response.data.session_id;
            this.startPolling();
            return response.data;
        } catch (error) {
            console.error('Error creating session:', error);
            throw error;
        }
    },

    /** Start executing a previously created session. */
    async startSession(sessionId) {
        try {
            const response = await axios.post(`/api/video-learning/sessions/${sessionId}/start`);
            return response.data;
        } catch (error) {
            console.error('Error starting session:', error);
            throw error;
        }
    },

    /**
     * Control a session.
     * @param {string} action - One of 'pause', 'resume', 'stop'.
     */
    async controlSession(sessionId, action) {
        try {
            const response = await axios.post(`/api/video-learning/sessions/${sessionId}/control`, {
                action: action,
            });
            return response.data;
        } catch (error) {
            console.error('Error controlling session:', error);
            throw error;
        }
    },

    /** Fetch and cache the latest session status. */
    async getSessionStatus(sessionId) {
        try {
            const response = await axios.get(`/api/video-learning/sessions/${sessionId}/status`);
            this.currentSessionStatus = response.data;
            return response.data;
        } catch (error) {
            console.error('Error getting session status:', error);
            throw error;
        }
    },

    /** Fetch and cache the list of watched videos for a session. */
    async getSessionVideos(sessionId) {
        try {
            const response = await axios.get(`/api/video-learning/sessions/${sessionId}/videos`);
            this.videos = response.data;
            return response.data;
        } catch (error) {
            console.error('Error getting session videos:', error);
            throw error;
        }
    },

    /** List all active session IDs known to the server. */
    async listSessions() {
        try {
            const response = await axios.get('/api/video-learning/sessions');
            return response.data;
        } catch (error) {
            console.error('Error listing sessions:', error);
            throw error;
        }
    },

    /** Delete a session server-side and clear local state if it was current. */
    async deleteSession(sessionId) {
        try {
            const response = await axios.delete(`/api/video-learning/sessions/${sessionId}`);
            if (this.currentSessionId === sessionId) {
                this.currentSessionId = null;
                this.currentSessionStatus = null;
                this.stopPolling();
            }
            return response.data;
        } catch (error) {
            console.error('Error deleting session:', error);
            throw error;
        }
    },

    /**
     * Start polling for session updates, dispatching `videoLearningUpdate`
     * window events on every tick. Polling stops when the session reports
     * completion (after one final refresh) or disappears server-side.
     */
    startPolling(intervalMs = 1000) {
        if (this.isPolling) return;
        this.isPolling = true;
        this.pollInterval = setInterval(async () => {
            if (this.currentSessionId) {
                try {
                    await this.getSessionStatus(this.currentSessionId);
                    await this.getSessionVideos(this.currentSessionId);
                    // Trigger custom event for UI updates
                    window.dispatchEvent(new CustomEvent('videoLearningUpdate', {
                        detail: {
                            sessionId: this.currentSessionId,
                            status: this.currentSessionStatus,
                            videos: this.videos,
                        }
                    }));
                    // Stop polling if session is complete, but do one final update
                    if (this.currentSessionStatus && !this.currentSessionStatus.is_active) {
                        console.log('[VideoLearning] Session completed, doing final update...');
                        // Do one final update to ensure we have the latest data
                        await this.getSessionStatus(this.currentSessionId);
                        await this.getSessionVideos(this.currentSessionId);
                        window.dispatchEvent(new CustomEvent('videoLearningUpdate', {
                            detail: {
                                sessionId: this.currentSessionId,
                                status: this.currentSessionStatus,
                                videos: this.videos,
                            }
                        }));
                        console.log('[VideoLearning] Final update complete, stopping poll');
                        this.stopPolling();
                    }
                } catch (error) {
                    // The server removes completed/stopped sessions from its
                    // registry, after which the status endpoint returns 404.
                    // Without this check the poller would log errors forever.
                    if (error.response && error.response.status === 404) {
                        console.warn('[VideoLearning] Session no longer exists, stopping poll');
                        this.stopPolling();
                    } else {
                        // Transient errors (network blips, 5xx): keep polling.
                        console.error('Error polling session status:', error);
                    }
                }
            }
        }, intervalMs);
        console.log(`[VideoLearning] Started polling with ${intervalMs}ms interval`);
    },

    /** Stop the polling timer, if any. */
    stopPolling() {
        if (this.pollInterval) {
            clearInterval(this.pollInterval);
            this.pollInterval = null;
            console.log('[VideoLearning] Stopped polling');
        }
        this.isPolling = false;
    },

    /** Format a duration in seconds as "12.3s" or "2m 5.0s". */
    formatDuration(seconds) {
        if (seconds < 60) {
            return `${seconds.toFixed(1)}s`;
        }
        const minutes = Math.floor(seconds / 60);
        const remainingSeconds = seconds % 60;
        return `${minutes}m ${remainingSeconds.toFixed(1)}s`;
    },

    /** Format a count with K/M suffixes; null/undefined become "N/A". */
    formatNumber(num) {
        if (num === null || num === undefined) return 'N/A';
        if (num >= 1000000) {
            return `${(num / 1000000).toFixed(1)}M`;
        } else if (num >= 1000) {
            return `${(num / 1000).toFixed(1)}K`;
        }
        return num.toString();
    },
};
// Export for use in other modules
if (typeof module !== 'undefined' && module.exports) {
    module.exports = VideoLearningModule;
}

View File

@@ -0,0 +1,412 @@
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Video Learning - AutoGLM Dashboard</title>
<!-- Vue.js 3 -->
<script src="https://unpkg.com/vue@3/dist/vue.global.js"></script>
<!-- Axios for API requests -->
<script src="https://unpkg.com/axios/dist/axios.min.js"></script>
<!-- CSS -->
<link rel="stylesheet" href="/static/css/dashboard.css">
<link rel="stylesheet" href="/static/css/video-learning.css">
</head>
<body>
<div id="app">
<!-- Header -->
<header class="header">
<div class="header-content">
<h1>
<svg width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<polygon points="23 7 16 12 23 17 23 7"></polygon>
<rect x="1" y="5" width="15" height="14" rx="2" ry="2"></rect>
</svg>
Video Learning Agent
</h1>
<div class="stats">
<span class="stat" title="Session Status">
<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<circle cx="12" cy="12" r="10"></circle>
<polyline points="12 6 12 12 16 14"></polyline>
</svg>
{{ sessionStatus ? sessionStatus.status : 'No Session' }}
</span>
<span class="stat" v-if="sessionStatus" title="Progress">
<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<path d="M22 11.08V12a10 10 0 1 1-5.93-9.14"></path>
<polyline points="22 4 12 14.01 9 11.01"></polyline>
</svg>
{{ sessionStatus.watched_count }} / {{ sessionStatus.target_count }}
</span>
</div>
</div>
<div class="header-actions">
<button @click="goBack" class="btn btn-secondary">
<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<line x1="19" y1="12" x2="5" y2="12"></line>
<polyline points="12 19 5 12 12 5"></polyline>
</svg>
Back
</button>
</div>
</header>
<!-- Main Content -->
<main class="main-content">
<!-- Configuration Section -->
<section class="config-section" v-if="!currentSessionId">
<h2>Create Learning Session</h2>
<div class="config-form">
<div class="form-group">
<label>Device</label>
<select v-model="config.deviceId" :disabled="loading">
<option value="">Select a device...</option>
<option v-for="device in devices" :key="device.device_id" :value="device.device_id"
:disabled="!device.is_connected || device.status === 'busy'">
{{ device.device_id }}
{{ !device.is_connected ? '(Disconnected)' : '' }}
{{ device.status === 'busy' ? '(Busy)' : '' }}
</option>
</select>
</div>
<div class="form-group">
<label>Platform</label>
<select v-model="config.platform" :disabled="loading">
<option value="douyin">Douyin (抖音)</option>
<option value="kuaishou">Kuaishou (快手)</option>
<option value="tiktok">TikTok</option>
</select>
</div>
<div class="form-row">
<div class="form-group">
<label>Target Videos</label>
<input type="number" v-model.number="config.targetCount" min="1" max="100" :disabled="loading">
</div>
<div class="form-group">
<label>Watch Duration (s)</label>
<input type="number" v-model.number="config.watchDuration" min="1" max="30" step="0.5" :disabled="loading">
</div>
</div>
<div class="form-group">
<label>Category (Optional)</label>
<input type="text" v-model="config.category" placeholder="e.g., 美食, 旅行, 搞笑" :disabled="loading">
<small>Leave empty to watch recommended videos</small>
</div>
<button @click="createAndStartSession" class="btn btn-primary" :disabled="loading || !config.deviceId">
<svg v-if="loading" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" class="spinning">
<path d="M21 12a9 9 0 1 1-6.219-8.56"></path>
</svg>
{{ loading ? 'Creating...' : 'Start Learning' }}
</button>
</div>
</section>
<!-- Session Control Section -->
<section class="session-section" v-if="currentSessionId && sessionStatus">
<div class="session-header">
<h2>Session: {{ currentSessionId }}</h2>
<div class="session-controls">
<button v-if="sessionStatus.is_paused" @click="resumeSession" class="btn btn-primary btn-sm">
<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<polygon points="5 3 19 12 5 21 5 3"></polygon>
</svg>
Resume
</button>
<button v-else-if="sessionStatus.is_active" @click="pauseSession" class="btn btn-secondary btn-sm">
<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<rect x="6" y="4" width="4" height="16"></rect>
<rect x="14" y="4" width="4" height="16"></rect>
</svg>
Pause
</button>
<button v-if="sessionStatus.is_active" @click="stopSession" class="btn btn-danger btn-sm">
<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<rect x="6" y="6" width="12" height="12"></rect>
</svg>
Stop
</button>
</div>
</div>
<!-- Progress Bar -->
<div class="progress-section" v-if="sessionStatus.is_active || sessionStatus.is_paused">
<div class="progress-info">
<span>Progress: {{ sessionStatus.watched_count }} / {{ sessionStatus.target_count }}</span>
<span>{{ Math.round(sessionStatus.progress_percent) }}%</span>
</div>
<div class="progress-bar-large">
<div class="progress-fill" :style="{ width: sessionStatus.progress_percent + '%' }"></div>
</div>
<div class="progress-stats">
<span>Total Duration: {{ formatDuration(sessionStatus.total_duration) }}</span>
</div>
</div>
<!-- Current Video -->
<div class="current-video" v-if="sessionStatus.current_video">
<h3>Current Video</h3>
<div class="video-card">
<div class="video-screenshot" v-if="sessionStatus.current_video.screenshot_path">
<img :src="sessionStatus.current_video.screenshot_path" alt="Current video">
</div>
<div class="video-info">
<div class="video-id">#{{ sessionStatus.current_video.sequence_id }}</div>
<div class="video-description" v-if="sessionStatus.current_video.description">
{{ sessionStatus.current_video.description }}
</div>
<div class="video-stats">
<span v-if="sessionStatus.current_video.likes">
<svg width="14" height="14" viewBox="0 0 24 24" fill="currentColor" stroke="none">
<path d="M20.84 4.61a5.5 5.5 0 0 0-7.78 0L12 5.67l-1.06-1.06a5.5 5.5 0 0 0-7.78 7.78l1.06 1.06L12 21.23l7.78-7.78 1.06-1.06a5.5 5.5 0 0 0 0-7.78z"></path>
</svg>
{{ formatNumber(sessionStatus.current_video.likes) }}
</span>
<span v-if="sessionStatus.current_video.comments">
<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<path d="M21 11.5a8.38 8.38 0 0 1-.9 3.8 8.5 8.5 0 0 1-7.6 4.7 8.38 8.38 0 0 1-3.8-.9L3 21l1.9-5.7a8.38 8.38 0 0 1-.9-3.8 8.5 8.5 0 0 1 4.7-7.6 8.38 8.38 0 0 1 3.8-.9h.5a8.48 8.48 0 0 1 8 8v.5z"></path>
</svg>
{{ formatNumber(sessionStatus.current_video.comments) }}
</span>
</div>
</div>
</div>
</div>
<!-- Session Complete -->
<div class="session-complete" v-if="!sessionStatus.is_active && currentSessionId">
<div class="complete-icon">
<svg width="64" height="64" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<path d="M22 11.08V12a10 10 0 1 1-5.93-9.14"></path>
<polyline points="22 4 12 14.01 9 11.01"></polyline>
</svg>
</div>
<h3>Session Complete!</h3>
<p>Watched {{ sessionStatus.watched_count }} videos in {{ formatDuration(sessionStatus.total_duration) }}</p>
<button @click="resetSession" class="btn btn-primary">Start New Session</button>
</div>
</section>
<!-- Video History -->
<section class="history-section" v-if="videos.length > 0">
<h2>Watched Videos</h2>
<div class="video-grid">
<div v-for="video in videos" :key="video.sequence_id" class="video-card">
<div class="video-screenshot" v-if="video.screenshot_path">
<img :src="video.screenshot_path" :alt="'Video ' + video.sequence_id">
</div>
<div class="video-placeholder" v-else>
<svg width="32" height="32" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<rect x="2" y="2" width="20" height="20" rx="2.18" ry="2.18"></rect>
<line x1="7" y1="2" x2="7" y2="22"></line>
<line x1="17" y1="2" x2="17" y2="22"></line>
<line x1="2" y1="12" x2="22" y2="12"></line>
<line x1="2" y1="7" x2="7" y2="7"></line>
<line x1="2" y1="17" x2="7" y2="17"></line>
<line x1="17" y1="17" x2="22" y2="17"></line>
<line x1="17" y1="7" x2="22" y2="7"></line>
</svg>
</div>
<div class="video-info">
<div class="video-id">#{{ video.sequence_id }}</div>
<div class="video-description" v-if="video.description">{{ video.description }}</div>
<div class="video-stats">
<span v-if="video.likes">
<svg width="12" height="12" viewBox="0 0 24 24" fill="currentColor" stroke="none">
<path d="M20.84 4.61a5.5 5.5 0 0 0-7.78 0L12 5.67l-1.06-1.06a5.5 5.5 0 0 0-7.78 7.78l1.06 1.06L12 21.23l7.78-7.78 1.06-1.06a5.5 5.5 0 0 0 0-7.78z"></path>
</svg>
{{ formatNumber(video.likes) }}
</span>
<span v-if="video.comments">
<svg width="12" height="12" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<path d="M21 11.5a8.38 8.38 0 0 1-.9 3.8 8.5 8.5 0 0 1-7.6 4.7 8.38 8.38 0 0 1-3.8-.9L3 21l1.9-5.7a8.38 8.38 0 0 1-.9-3.8 8.5 8.5 0 0 1 4.7-7.6 8.38 8.38 0 0 1 3.8-.9h.5a8.48 8.48 0 0 1 8 8v.5z"></path>
</svg>
{{ formatNumber(video.comments) }}
</span>
</div>
</div>
</div>
</div>
</section>
</main>
<!-- Toast notifications -->
<div class="toast-container">
<div v-for="toast in toasts" :key="toast.id" class="toast" :class="toast.type">
{{ toast.message }}
</div>
</div>
</div>
<script src="/static/js/video-learning.js"></script>
<script>
// Vue 3 root component for the Video Learning page.
// All backend calls and polling are delegated to the global
// VideoLearningModule helper (loaded from /static/js/video-learning.js);
// axios is used directly only for the device list.
const { createApp } = Vue;
createApp({
    data() {
        return {
            devices: [],             // devices returned by GET /api/devices
            currentSessionId: null,  // id of the running session, if any
            sessionStatus: null,     // latest status snapshot for that session
            videos: [],              // video records collected so far
            loading: false,          // guards the create/start button
            toasts: [],              // active toast notifications
            toastIdCounter: 0,       // unique id source for toasts
            // Form state for creating a new session.
            config: {
                deviceId: '',
                platform: 'douyin',
                targetCount: 10,
                category: '',
                watchDuration: 3.0,
            },
        };
    },
    mounted() {
        // Load selectable devices and subscribe to live progress events.
        this.loadDevices();
        this.setupVideoLearningEvents();
    },
    methods: {
        // Fetch connected devices; shows a toast on failure.
        async loadDevices() {
            try {
                const response = await axios.get('/api/devices');
                this.devices = response.data;
            } catch (error) {
                this.showToast('Failed to load devices', 'error');
            }
        },
        // Create a session from the form config, then immediately start it.
        async createAndStartSession() {
            if (!this.config.deviceId) {
                this.showToast('Please select a device', 'error');
                return;
            }
            this.loading = true;
            try {
                // Create session
                const createResult = await VideoLearningModule.createSession(
                    this.config.deviceId,
                    {
                        platform: this.config.platform,
                        targetCount: this.config.targetCount,
                        // Empty category string is sent as null ("no filter").
                        category: this.config.category || null,
                        watchDuration: this.config.watchDuration,
                    }
                );
                this.currentSessionId = createResult.session_id;
                this.showToast('Session created! Starting...', 'success');
                // Start session
                await VideoLearningModule.startSession(this.currentSessionId);
                this.showToast('Learning session started!', 'success');
                // Initial status update
                await this.updateSessionStatus();
            } catch (error) {
                this.showToast('Failed to create session: ' + error.message, 'error');
            } finally {
                this.loading = false;
            }
        },
        // Pause the running session; no-op when none is active.
        async pauseSession() {
            if (!this.currentSessionId) return;
            try {
                await VideoLearningModule.controlSession(this.currentSessionId, 'pause');
                await this.updateSessionStatus();
                this.showToast('Session paused', 'info');
            } catch (error) {
                this.showToast('Failed to pause session', 'error');
            }
        },
        // Resume a paused session.
        async resumeSession() {
            if (!this.currentSessionId) return;
            try {
                await VideoLearningModule.controlSession(this.currentSessionId, 'resume');
                await this.updateSessionStatus();
                this.showToast('Session resumed', 'info');
            } catch (error) {
                this.showToast('Failed to resume session', 'error');
            }
        },
        // Stop the session after explicit user confirmation.
        async stopSession() {
            if (!this.currentSessionId) return;
            if (!confirm('Are you sure you want to stop this session?')) return;
            try {
                await VideoLearningModule.controlSession(this.currentSessionId, 'stop');
                await this.updateSessionStatus();
                this.showToast('Session stopped', 'info');
            } catch (error) {
                this.showToast('Failed to stop session', 'error');
            }
        },
        // Refresh status and video list for the current session.
        async updateSessionStatus() {
            if (!this.currentSessionId) return;
            try {
                this.sessionStatus = await VideoLearningModule.getSessionStatus(this.currentSessionId);
                this.videos = await VideoLearningModule.getSessionVideos(this.currentSessionId);
            } catch (error) {
                console.error('Error updating session status:', error);
            }
        },
        // Subscribe to push-style updates dispatched by VideoLearningModule.
        setupVideoLearningEvents() {
            window.addEventListener('videoLearningUpdate', (event) => {
                const { status, videos } = event.detail;
                this.sessionStatus = status;
                this.videos = videos;
            });
        },
        // Clear all session state and stop background polling.
        resetSession() {
            this.currentSessionId = null;
            this.sessionStatus = null;
            this.videos = [];
            VideoLearningModule.stopPolling();
        },
        // Navigate back to the main dashboard.
        goBack() {
            window.location.href = '/';
        },
        // Formatting helpers delegate to VideoLearningModule.
        formatDuration(seconds) {
            return VideoLearningModule.formatDuration(seconds);
        },
        formatNumber(num) {
            return VideoLearningModule.formatNumber(num);
        },
        // Show a transient toast notification for 3 seconds.
        showToast(message, type = 'info') {
            const id = this.toastIdCounter++;
            this.toasts.push({ id, message, type });
            setTimeout(() => {
                this.toasts = this.toasts.filter(t => t.id !== id);
            }, 3000);
        },
    },
    beforeUnmount() {
        // Stop polling when the page/component is torn down.
        VideoLearningModule.stopPolling();
    },
}).mount('#app');
</script>
</body>
</html>

253
docs/VIDEO_LEARNING.md Normal file
View File

@@ -0,0 +1,253 @@
# Video Learning Agent
AI-powered agent for learning from short video platforms like Douyin (抖音), Kuaishou (快手), and TikTok.
## 功能特性
### MVP 功能
- **自动滑动**: 自动在视频之间滑动切换
- **播放控制**: 播放/暂停控制
- **截图记录**: 为每个视频截图保存
- **数据采集**: 采集视频描述、点赞数、评论数
- **可视化管理**: 通过 Web Dashboard 可视化控制
- **会话管理**: 创建、暂停、恢复、停止学习会话
- **数据导出**: 导出学习数据(JSON/CSV)
## 快速开始
### 1. 启动 Dashboard
```bash
# 使用脚本启动(推荐)
scripts\run_video_learning_demo.bat      # Windows
bash scripts/run_video_learning_demo.sh  # Linux/Mac
# 或手动启动
python -m uvicorn dashboard.main:app --host 0.0.0.0 --port 8080 --reload
```
### 2. 访问 Video Learning 页面
打开浏览器访问: `http://localhost:8080/static/video-learning.html`
或从主 Dashboard 页面点击 "Video Learning" 按钮。
### 3. 创建学习会话
1. 选择设备
2. 选择平台(抖音/快手/TikTok)
3. 设置目标视频数量
4. (可选)设置类别筛选
5. 设置观看时长
6. 点击 "Start Learning"
## 使用示例
### 独立运行
```bash
python examples/video_learning_demo.py \
--device-id emulator-5554 \
--count 10 \
--category "美食" \
--watch-duration 3.0
```
### 通过 Dashboard
1. 打开 Video Learning 页面
2. 配置学习参数
3. 点击启动
4. 实时查看进度
### API 调用
```python
from phone_agent import VideoLearningAgent
from phone_agent.model.client import ModelConfig
# 创建模型配置
model_config = ModelConfig(
base_url="https://open.bigmodel.cn/api/paas/v4",
model_name="autoglm-phone-9b",
api_key="your-api-key",
)
# 创建 Video Learning Agent
agent = VideoLearningAgent(
model_config=model_config,
platform="douyin",
output_dir="./video_learning_data",
)
# 启动会话
session_id = agent.start_session(
device_id="emulator-5554",
target_count=10,
category="美食",
watch_duration=3.0,
)
# 运行任务
task = """
在抖音上学习"美食"类视频:
1. 打开抖音并搜索"美食"
2. 观看视频每个视频约3秒
3. 记录描述、点赞数、评论数
4. 滑动到下一个视频
5. 重复直到观看完10个视频
"""
success = agent.run_learning_task(task)
# 导出数据
agent.export_data("json")
agent.export_data("csv")
```
## API 端点
### 创建会话
```http
POST /api/video-learning/sessions
Content-Type: application/json
{
"device_id": "emulator-5554",
"platform": "douyin",
"target_count": 10,
"category": "",
"watch_duration": 3.0
}
```
### 启动会话
```http
POST /api/video-learning/sessions/{session_id}/start
```
### 控制会话
```http
POST /api/video-learning/sessions/{session_id}/control
Content-Type: application/json
{
"action": "pause" // pause, resume, stop
}
```
### 获取会话状态
```http
GET /api/video-learning/sessions/{session_id}/status
```
### 获取会话视频列表
```http
GET /api/video-learning/sessions/{session_id}/videos
```
## 数据结构
### VideoRecord
```python
{
"sequence_id": 1,
"timestamp": "2024-01-09T10:00:00",
"screenshot_path": "./video_learning_data/screenshots/...",
"watch_duration": 3.0,
"description": "视频描述文案",
"likes": 1000,
"comments": 50,
"tags": [],
"category": "美食"
}
```
### LearningSession
```python
{
"session_id": "session_20240109_100000",
"start_time": "2024-01-09T10:00:00",
"platform": "douyin",
"target_category": "美食",
"target_count": 10,
"is_active": true,
"is_paused": false,
"total_videos": 10,
"total_duration": 30.0,
"records": [...]
}
```
## 配置选项
`.env` 文件中配置:
```bash
# 视频学习数据输出目录
VIDEO_LEARNING_OUTPUT_DIR=./video_learning_data
# 模型参数
PHONE_AGENT_MAX_TOKENS=3000
PHONE_AGENT_TEMPERATURE=0.0
PHONE_AGENT_TOP_P=0.85
PHONE_AGENT_FREQUENCY_PENALTY=0.2
```
## 后续扩展计划
### 阶段 2: 高级分析
- [ ] 视频内容特征提取
- [ ] 常见元素识别
- [ ] 视频风格分析
- [ ] BGM 识别
### 阶段 3: 模式学习
- [ ] 同类视频模式归纳
- [ ] 创作趋势分析
- [ ] 热门元素统计
- [ ] 最佳实践总结
### 阶段 4: 创作辅助
- [ ] 脚本生成
- [ ] 分镜头建议
- [ ] 拍摄指导
- [ ] 剪辑建议
## 技术架构
```
VideoLearningAgent
├── ModelConfig (VLM 配置)
├── LearningSession (会话管理)
│ └── VideoRecord[] (视频记录)
├── Callbacks (回调函数)
│ ├── on_video_watched
│ ├── on_progress_update
│ └── on_session_complete
└── PhoneAgent (底层操作)
├── 视觉理解 (VLM)
├── 设备控制 (ADB/HDC/iOS)
└── 任务执行
```
## 故障排除
### 问题: 设备未连接
- 确保 ADB/HDC 服务正在运行
- 检查设备是否通过 USB 连接
- 尝试点击 "Refresh" 按钮
### 问题: 任务无法启动
- 检查模型 API 配置
- 确保 `.env` 文件正确配置
- 查看 Dashboard 控制台日志
### 问题: 视频信息未采集
- 确保 VLM 模型正常工作
- 检查网络连接
- 增加观看时长
## 许可证
MIT License

View File

@@ -0,0 +1,161 @@
"""
Video Learning Agent Demo
This script demonstrates how to use the VideoLearningAgent to watch
and learn from short video platforms like Douyin.
Usage:
python examples/video_learning_demo.py --device-id <device_id> --count 10
"""
import os
import sys
from pathlib import Path
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from phone_agent.model.client import ModelConfig
from phone_agent.video_learning import VideoLearningAgent
def main():
    """Main demo function.

    Reads configuration from environment variables, creates a
    VideoLearningAgent for Douyin, runs a natural-language watching
    task, and exports the collected data as JSON and CSV.
    """
    # Load configuration from environment
    base_url = os.getenv("MODEL_BASE_URL", "http://localhost:8000/v1")
    api_key = os.getenv("MODEL_API_KEY", "your-api-key")
    model_name = os.getenv("MODEL_NAME", "autoglm-phone-9b")
    # Configuration
    device_id = os.getenv("DEVICE_ID", "emulator-5554")
    target_count = int(os.getenv("TARGET_COUNT", "10"))
    watch_duration = float(os.getenv("WATCH_DURATION", "3.0"))
    category = os.getenv("CATEGORY", None)  # e.g., "美食", "旅行", "搞笑"

    # Print a banner summarizing the run configuration.
    print("=" * 60)
    print("Video Learning Agent Demo")
    print("=" * 60)
    print(f"Device: {device_id}")
    print(f"Platform: Douyin")
    print(f"Target videos: {target_count}")
    print(f"Watch duration: {watch_duration}s per video")
    if category:
        print(f"Category filter: {category}")
    print("=" * 60)

    # Create agent
    model_config = ModelConfig(
        base_url=base_url,
        model_name=model_name,
        api_key=api_key,
        lang="cn",
    )
    agent = VideoLearningAgent(
        model_config=model_config,
        platform="douyin",
        output_dir="./video_learning_data",
    )

    # Setup callbacks (invoked by the agent as videos are recorded)
    def on_video_watched(record):
        # Per-video summary printed as each record is captured.
        print(f"\n[Video {record.sequence_id}] Watched!")
        if record.description:
            print(f" Description: {record.description}")
        if record.likes:
            print(f" Likes: {record.likes}")
        print(f" Screenshot: {record.screenshot_path}")

    def on_progress_update(current, total):
        # Progress line; guards against division by zero when total is 0.
        percent = (current / total * 100) if total > 0 else 0
        print(f"\nProgress: {current}/{total} ({percent:.1f}%)")

    def on_session_complete(session):
        # Final summary once the session finishes or is stopped.
        print("\n" + "=" * 60)
        print("Session Complete!")
        print("=" * 60)
        print(f"Total videos watched: {session.total_videos}")
        print(f"Total duration: {session.total_duration:.1f}s")
        print(f"Data saved to: ./video_learning_data/{session.session_id}.json")

    agent.on_video_watched = on_video_watched
    agent.on_progress_update = on_progress_update
    agent.on_session_complete = on_session_complete

    # Start session
    session_id = agent.start_session(
        device_id=device_id,
        target_count=target_count,
        category=category,
        watch_duration=watch_duration,
    )
    print(f"\nSession started: {session_id}")
    print("Starting video watching task...\n")

    # Construct the task (search-by-category vs. recommendation feed).
    if category:
        task = f"""
请帮我学习抖音上的"{category}"类视频。具体任务如下:
1. 打开抖音应用
2. 搜索"{category}"
3. 开始观看视频每个视频观看约{watch_duration}秒
4. 记录每个视频的描述、点赞数、评论数等信息
5. 滑动到下一个视频
6. 重复步骤3-5直到观看完{target_count}个视频
请按照以下格式记录每个视频:
- 视频序号
- 描述文案(屏幕上的文字)
- 点赞数(如果有显示)
- 评论数(如果有显示)
- 截图
每个视频观看时请等待{watch_duration}秒后再滑动到下一个。
"""
    else:
        task = f"""
请帮我学习抖音上的推荐视频。具体任务如下:
1. 打开抖音应用
2. 在推荐页开始观看视频每个视频观看约{watch_duration}秒
3. 记录每个视频的描述、点赞数、评论数等信息
4. 向上滑动到下一个视频
5. 重复步骤3-4直到观看完{target_count}个视频
请按照以下格式记录每个视频:
- 视频序号
- 描述文案(屏幕上的文字)
- 点赞数(如果有显示)
- 评论数(如果有显示)
- 截图
每个视频观看时请等待{watch_duration}秒后再滑动到下一个。
"""

    # Run the task
    success = agent.run_learning_task(task)
    if success:
        print("\n✓ Learning task completed successfully!")
        # Export data
        json_file = agent.export_data("json")
        print(f"✓ Data exported to: {json_file}")
        csv_file = agent.export_data("csv")
        print(f"✓ Data exported to: {csv_file}")
    else:
        print("\n✗ Learning task failed")

    # Print the progress summary in both success and failure cases.
    print("\nSession progress:")
    progress = agent.get_session_progress()
    for key, value in progress.items():
        print(f" {key}: {value}")


if __name__ == "__main__":
    main()

View File

@@ -7,6 +7,7 @@ using AI models for visual understanding and decision making.
from phone_agent.agent import AgentConfig, PhoneAgent, StepResult
from phone_agent.agent_ios import IOSAgentConfig, IOSPhoneAgent
from phone_agent.video_learning import VideoLearningAgent, VideoRecord, LearningSession
__version__ = "0.1.0"
__all__ = [
@@ -15,4 +16,7 @@ __all__ = [
"AgentConfig",
"IOSAgentConfig",
"StepResult",
"VideoLearningAgent",
"VideoRecord",
"LearningSession",
]

View File

@@ -0,0 +1,561 @@
"""
Video Learning Agent for AutoGLM
This agent learns from short video platforms (like Douyin/TikTok)
by watching videos and collecting information.
MVP Features:
- Automatic video scrolling
- Play/Pause control
- Screenshot capture for each video
- Basic data collection (likes, comments, etc.)
"""
import hashlib
import json
import os
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Callable, Dict, List, Optional, Any
from phone_agent import PhoneAgent, AgentConfig
from phone_agent.agent import StepResult
from phone_agent.model.client import ModelConfig
from phone_agent.device_factory import get_device_factory
@dataclass
class VideoRecord:
    """A single watched video and the data collected for it."""

    sequence_id: int
    timestamp: str
    screenshot_path: Optional[str] = None
    watch_duration: float = 0.0  # seconds
    # Basic info (extracted via OCR/analysis)
    description: Optional[str] = None  # Video caption/text
    likes: Optional[int] = None
    comments: Optional[int] = None
    shares: Optional[int] = None
    # Content analysis (for future expansion)
    tags: List[str] = field(default_factory=list)
    category: Optional[str] = None
    elements: List[str] = field(default_factory=list)
    # Metadata
    position_in_session: int = 0

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this record to a plain dictionary (fixed key order)."""
        exported = (
            "sequence_id",
            "timestamp",
            "screenshot_path",
            "watch_duration",
            "description",
            "likes",
            "comments",
            "shares",
            "tags",
            "category",
            "elements",
            "position_in_session",
        )
        return {name: getattr(self, name) for name in exported}
@dataclass
class LearningSession:
    """One learning run: configuration, control flags, and its video records."""

    session_id: str
    start_time: str
    platform: str  # "douyin", "tiktok", etc.
    target_category: Optional[str] = None
    target_count: int = 10
    records: List[VideoRecord] = field(default_factory=list)
    # Control flags
    is_active: bool = True
    is_paused: bool = False
    # Statistics
    total_videos: int = 0
    total_duration: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the session, including every video record, to a dict."""
        scalar_names = (
            "session_id",
            "start_time",
            "platform",
            "target_category",
            "target_count",
            "is_active",
            "is_paused",
            "total_videos",
            "total_duration",
        )
        payload: Dict[str, Any] = {name: getattr(self, name) for name in scalar_names}
        payload["records"] = [record.to_dict() for record in self.records]
        return payload
class VideoLearningAgent:
"""
Agent for learning from short video platforms.
MVP Capabilities:
- Navigate to video platform
- Watch videos automatically
- Capture screenshots
- Collect basic information
- Export learning data
"""
# Platform-specific configurations
PLATFORM_CONFIGS = {
"douyin": {
"package_name": "com.ss.android.ugc.aweme",
"activity_hint": "aweme",
"scroll_gesture": "up",
"like_position": {"x": 0.9, "y": 0.8}, # Relative coordinates
"comment_position": {"x": 0.9, "y": 0.7},
},
"kuaishou": {
"package_name": "com.smile.gifmaker",
"activity_hint": "gifmaker",
"scroll_gesture": "up",
"like_position": {"x": 0.9, "y": 0.8},
},
"tiktok": {
"package_name": "com.zhiliaoapp.musically",
"activity_hint": "musically",
"scroll_gesture": "up",
"like_position": {"x": 0.9, "y": 0.8},
},
}
def __init__(
self,
model_config: ModelConfig,
platform: str = "douyin",
output_dir: str = "./video_learning_data",
):
"""
Initialize Video Learning Agent.
Args:
model_config: Model configuration for VLM
platform: Platform name (douyin, kuaishou, tiktok)
output_dir: Directory to save screenshots and data
"""
self.model_config = model_config
self.platform = platform
self.output_dir = Path(output_dir)
self.output_dir.mkdir(parents=True, exist_ok=True)
# Create screenshots subdirectory
self.screenshot_dir = self.output_dir / "screenshots"
self.screenshot_dir.mkdir(exist_ok=True)
# Current session
self.current_session: Optional[LearningSession] = None
self.video_counter = 0
# Agent will be created when starting a session
self.agent: Optional[PhoneAgent] = None
# Callbacks for external control
self.on_video_watched: Optional[Callable[[VideoRecord], None]] = None
self.on_session_complete: Optional[Callable[[LearningSession], None]] = None
self.on_progress_update: Optional[Callable[[int, int], None]] = None
# Video detection: track screenshot changes (simplified)
self._last_screenshot_hash: Optional[str] = None
def start_session(
self,
device_id: str,
target_count: int = 10,
category: Optional[str] = None,
watch_duration: float = 3.0,
max_steps: int = 500,
) -> str:
"""
Start a learning session.
Args:
device_id: Target device ID
target_count: Number of videos to watch
category: Target category (e.g., "美食", "旅行")
watch_duration: How long to watch each video (seconds)
max_steps: Maximum execution steps
Returns:
Session ID
"""
# Create new session
session_id = f"session_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
self.current_session = LearningSession(
session_id=session_id,
start_time=datetime.now().isoformat(),
platform=self.platform,
target_category=category,
target_count=target_count,
)
# Configure agent with callbacks
agent_config = AgentConfig(
device_id=device_id,
max_steps=max_steps,
lang="cn",
step_callback=self._on_step,
before_action_callback=self._before_action,
)
# Create phone agent
self.agent = PhoneAgent(
model_config=self.model_config,
agent_config=agent_config,
)
# Store parameters for the task
self._watch_duration = watch_duration
self._device_id = device_id
# Reset video detection tracking (simplified)
self._last_screenshot_hash = None
self.video_counter = 0
return session_id
def run_learning_task(self, task: str) -> bool:
"""
Run the learning task.
Args:
task: Natural language task description
Returns:
True if successful
"""
if not self.agent or not self.current_session:
raise RuntimeError("Session not started. Call start_session() first.")
try:
result = self.agent.run(task)
# Mark session as inactive after task completes
if self.current_session:
self.current_session.is_active = False
self._save_session()
print(f"[VideoLearning] Session completed. Recorded {self.video_counter} videos.")
return bool(result)
except Exception as e:
print(f"Error during learning: {e}")
if self.current_session:
self.current_session.is_active = False
return False
def stop_session(self):
"""Stop the current learning session."""
if self.current_session:
self.current_session.is_active = False
if self.agent:
# Agent will stop on next callback check
pass
def pause_session(self):
"""Pause the current session (can be resumed)."""
if self.current_session:
self.current_session.is_paused = True
def resume_session(self):
"""Resume a paused session."""
if self.current_session:
self.current_session.is_paused = False
def _on_step(self, result: StepResult) -> Optional[str]:
"""
Callback after each step.
Simplified logic:
1. Check if we're in the target app using get_current_app()
2. Detect screenshot changes
3. Record video when screenshot changes
Args:
result: Step execution result
Returns:
"stop" to end session, new task to switch, None to continue
"""
if not self.current_session:
return None
# Check if session should stop
if not self.current_session.is_active:
self._save_session()
if self.on_session_complete:
self.on_session_complete(self.current_session)
return "stop"
# Check if paused
if self.current_session.is_paused:
return None
# Check if we've watched enough videos
if self.video_counter >= self.current_session.target_count:
self.current_session.is_active = False
self._save_session()
if self.on_session_complete:
self.on_session_complete(self.current_session)
return "stop"
try:
# Use get_current_app() to detect if we're in target app
current_app = get_device_factory().get_current_app(self._device_id)
# Platform-specific package names
platform_packages = {
"douyin": ["aweme", "抖音", "douyin"],
"kuaishou": ["gifmaker", "快手", "kuaishou"],
"tiktok": ["musically", "tiktok"],
}
packages = platform_packages.get(self.platform, ["aweme"])
# Check if in target app
is_in_target = any(pkg.lower() in current_app.lower() for pkg in packages)
if not is_in_target:
print(f"[VideoLearning] Not in target app: {current_app} (step {result.step_count})")
return None
# Get screenshot
screenshot = get_device_factory().get_screenshot(self._device_id)
# Use full base64 data for hash (more sensitive)
current_hash = hashlib.md5(screenshot.base64_data.encode()).hexdigest()
# Detect screenshot change and record video
if self._last_screenshot_hash is None:
# First screenshot in target app - record first video
self._last_screenshot_hash = current_hash
self._record_video_from_screenshot(screenshot)
print(f"[VideoLearning] ✓ Recorded video {self.video_counter}/{self.current_session.target_count}")
# Check if we've reached target after recording
if self.video_counter >= self.current_session.target_count:
print(f"[VideoLearning] ✓ Target reached! Stopping...")
self.current_session.is_active = False
self._save_session()
return "stop"
elif current_hash != self._last_screenshot_hash:
# Screenshot changed - record new video
self._last_screenshot_hash = current_hash
self._record_video_from_screenshot(screenshot)
print(f"[VideoLearning] ✓ Recorded video {self.video_counter}/{self.current_session.target_count}")
# Check if we've reached target after recording
if self.video_counter >= self.current_session.target_count:
print(f"[VideoLearning] ✓ Target reached! Stopping...")
self.current_session.is_active = False
self._save_session()
return "stop"
except Exception as e:
print(f"[VideoLearning] Warning: {e}")
return None
def _record_video_from_screenshot(self, screenshot):
"""Helper method to record video from screenshot."""
import base64
screenshot_bytes = base64.b64decode(screenshot.base64_data)
self.record_video(
screenshot=screenshot_bytes,
description=f"Video #{self.video_counter + 1}",
)
def _before_action(self, action: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""
Callback before executing an action.
Args:
action: Action to execute
Returns:
Modified action or None
"""
# Could be used for action logging or modification
return None
def record_video(
self,
screenshot: Optional[bytes] = None,
description: Optional[str] = None,
likes: Optional[int] = None,
comments: Optional[int] = None,
) -> VideoRecord:
"""
Record a watched video.
Args:
screenshot: Screenshot image data
description: Video description/caption
likes: Number of likes
comments: Number of comments
Returns:
VideoRecord object
"""
self.video_counter += 1
# Save screenshot if provided
screenshot_path = None
if screenshot:
screenshot_filename = f"{self.current_session.session_id}_video_{self.video_counter}.png"
screenshot_full_path = self.screenshot_dir / screenshot_filename
# Store relative path for web access: /video-learning-data/screenshots/filename.png
screenshot_path = f"/video-learning-data/screenshots/{screenshot_filename}"
with open(str(screenshot_full_path), "wb") as f:
f.write(screenshot)
# Create record
record = VideoRecord(
sequence_id=self.video_counter,
timestamp=datetime.now().isoformat(),
screenshot_path=screenshot_path,
watch_duration=self._watch_duration,
description=description,
likes=likes,
comments=comments,
position_in_session=self.video_counter,
)
# Add to session
if self.current_session:
self.current_session.records.append(record)
self.current_session.total_videos = self.video_counter
self.current_session.total_duration += self._watch_duration
# Notify callback
if self.on_video_watched:
self.on_video_watched(record)
# Notify progress
if self.on_progress_update:
self.on_progress_update(self.video_counter, self.current_session.target_count)
return record
def _save_session(self):
"""Save session data to JSON file."""
if not self.current_session:
return
session_file = self.output_dir / f"{self.current_session.session_id}.json"
with open(session_file, "w", encoding="utf-8") as f:
json.dump(self.current_session.to_dict(), f, ensure_ascii=False, indent=2)
print(f"Session saved to {session_file}")
def export_data(self, format: str = "json") -> str:
"""
Export session data.
Args:
format: Export format (json, csv)
Returns:
Path to exported file
"""
if not self.current_session:
raise RuntimeError("No session to export")
if format == "json":
return self._export_json()
elif format == "csv":
return self._export_csv()
else:
raise ValueError(f"Unsupported format: {format}")
def _export_json(self) -> str:
"""Export as JSON."""
output_file = self.output_dir / f"{self.current_session.session_id}_export.json"
with open(output_file, "w", encoding="utf-8") as f:
json.dump(self.current_session.to_dict(), f, ensure_ascii=False, indent=2)
return str(output_file)
def _export_csv(self) -> str:
"""Export as CSV."""
import csv
output_file = self.output_dir / f"{self.current_session.session_id}_export.csv"
with open(output_file, "w", encoding="utf-8", newline="") as f:
if not self.current_session.records:
return str(output_file)
writer = csv.DictWriter(f, fieldnames=self.current_session.records[0].to_dict().keys())
writer.writeheader()
for record in self.current_session.records:
writer.writerow(record.to_dict())
return str(output_file)
def get_session_progress(self) -> Dict[str, Any]:
"""Get current session progress."""
if not self.current_session:
return {"status": "no_session"}
return {
"session_id": self.current_session.session_id,
"platform": self.current_session.platform,
"target_count": self.current_session.target_count,
"watched_count": self.video_counter,
"progress_percent": (self.video_counter / self.current_session.target_count * 100)
if self.current_session.target_count > 0
else 0,
"is_active": self.current_session.is_active,
"is_paused": self.current_session.is_paused,
"total_duration": self.current_session.total_duration,
}
# Convenience function for standalone usage
def create_video_learning_agent(
    base_url: str,
    api_key: str,
    model_name: str = "autoglm-phone-9b",
    platform: str = "douyin",
    output_dir: str = "./video_learning_data",
    **model_kwargs,
) -> VideoLearningAgent:
    """
    Create a Video Learning Agent with standard configuration.

    Args:
        base_url: Model API base URL
        api_key: API key
        model_name: Model name
        platform: Platform name
        output_dir: Output directory
        **model_kwargs: Additional model parameters

    Returns:
        VideoLearningAgent instance
    """
    config = ModelConfig(
        base_url=base_url,
        model_name=model_name,
        api_key=api_key,
        **model_kwargs,
    )
    return VideoLearningAgent(
        model_config=config,
        platform=platform,
        output_dir=output_dir,
    )

View File

@@ -0,0 +1,35 @@
@echo off
REM Video Learning Demo Script for Windows
REM This script starts the dashboard and opens the video learning page
REM NOTE: run from the repository root; assumes python (with uvicorn and
REM the project's dependencies installed) is on PATH.
echo ============================================
echo AutoGLM Video Learning Demo
echo ============================================
echo.
echo Starting Dashboard...
echo.
REM Start the dashboard in background (separate console window)
start "AutoGLM Dashboard" python -m uvicorn dashboard.main:app --host 0.0.0.0 --port 8080 --reload
REM Give uvicorn a moment to bind before opening the browser
echo Waiting for dashboard to start...
timeout /t 3 /nobreak > nul
echo.
echo Dashboard starting at: http://localhost:8080
echo Opening Video Learning page in browser...
echo.
REM Open the video learning page
start http://localhost:8080/static/video-learning.html
echo.
echo ============================================
echo Video Learning Demo is ready!
echo ============================================
echo.
echo Press Ctrl+C to stop the dashboard
echo.
REM Keep the script running (the dashboard itself runs in the
REM separate window started above)
pause

View File

@@ -0,0 +1,43 @@
#!/bin/bash
# Video Learning Demo Script for Linux/Mac
# This script starts the dashboard and opens the video learning page
# NOTE: run from the repository root; assumes python (with uvicorn and
# the project's dependencies installed) is on PATH.
echo "============================================"
echo "AutoGLM Video Learning Demo"
echo "============================================"
echo ""
echo "Starting Dashboard..."
echo ""
# Start the dashboard in background
python -m uvicorn dashboard.main:app --host 0.0.0.0 --port 8080 --reload &
DASHBOARD_PID=$!
# Give uvicorn a moment to bind before opening the browser
echo "Waiting for dashboard to start..."
sleep 3
echo ""
echo "Dashboard starting at: http://localhost:8080"
echo "Opening Video Learning page in browser..."
echo ""
# Open the video learning page
# Prefer xdg-open (Linux), fall back to open (macOS), else print the URL
if command -v xdg-open > /dev/null; then
    xdg-open http://localhost:8080/static/video-learning.html
elif command -v open > /dev/null; then
    open http://localhost:8080/static/video-learning.html
else
    echo "Please open your browser and navigate to:"
    echo "http://localhost:8080/static/video-learning.html"
fi
echo ""
echo "============================================"
echo "Video Learning Demo is ready!"
echo "============================================"
echo ""
echo "Press Ctrl+C to stop the dashboard"
echo ""
# Wait for dashboard process (blocks until it exits or is interrupted)
wait $DASHBOARD_PID