Add Video Learning Agent for short video platforms

Features:
- VideoLearningAgent for automated video watching on Douyin/Kuaishou/TikTok
- Web dashboard UI for video learning sessions
- Real-time progress tracking with screenshot capture
- App detection using get_current_app() for accurate recording
- Session management with pause/resume/stop controls

Technical improvements:
- Simplified video detection logic using direct app detection
- Full base64 hash for sensitive screenshot change detection
- Immediate stop when target video count is reached
- Fixed circular import issues with ModelConfig

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-09 22:54:57 +08:00
parent 3552df23d6
commit 5b3f214e20
15 changed files with 2317 additions and 1 deletions
--- a/examples/video_learning_demo.py
+++ b/examples/video_learning_demo.py
@@ -0,0 +1,161 @@
+"""
+Video Learning Agent Demo
+
+This script demonstrates how to use the VideoLearningAgent to watch
+and learn from short video platforms like Douyin.
+
+Usage:
+    DEVICE_ID=<device_id> TARGET_COUNT=10 python examples/video_learning_demo.py
+"""
+
+import os
+import sys
+from pathlib import Path
+
+# Put the directory above examples/ on sys.path so `phone_agent` imports when run as a script
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from phone_agent.model.client import ModelConfig
+from phone_agent.video_learning import VideoLearningAgent
+
+
+def main():
+    """Main demo function."""
+
+    # Load configuration from environment
+    base_url = os.getenv("MODEL_BASE_URL", "http://localhost:8000/v1")
+    api_key = os.getenv("MODEL_API_KEY", "your-api-key")
+    model_name = os.getenv("MODEL_NAME", "autoglm-phone-9b")
+
+    # Configuration
+    device_id = os.getenv("DEVICE_ID", "emulator-5554")
+    target_count = int(os.getenv("TARGET_COUNT", "10"))
+    watch_duration = float(os.getenv("WATCH_DURATION", "3.0"))
+    category = os.getenv("CATEGORY", None)  # e.g., "美食", "旅行", "搞笑"
+
+    print("=" * 60)
+    print("Video Learning Agent Demo")
+    print("=" * 60)
+    print(f"Device: {device_id}")
+    print(f"Platform: Douyin")
+    print(f"Target videos: {target_count}")
+    print(f"Watch duration: {watch_duration}s per video")
+    if category:
+        print(f"Category filter: {category}")
+    print("=" * 60)
+
+    # Create agent
+    model_config = ModelConfig(
+        base_url=base_url,
+        model_name=model_name,
+        api_key=api_key,
+        lang="cn",
+    )
+
+    agent = VideoLearningAgent(
+        model_config=model_config,
+        platform="douyin",
+        output_dir="./video_learning_data",
+    )
+
+    # Setup callbacks
+    def on_video_watched(record):
+        print(f"\n[Video {record.sequence_id}] Watched!")
+        if record.description:
+            print(f"  Description: {record.description}")
+        if record.likes:
+            print(f"  Likes: {record.likes}")
+        print(f"  Screenshot: {record.screenshot_path}")
+
+    def on_progress_update(current, total):
+        percent = (current / total * 100) if total > 0 else 0
+        print(f"\nProgress: {current}/{total} ({percent:.1f}%)")
+
+    def on_session_complete(session):
+        print("\n" + "=" * 60)
+        print("Session Complete!")
+        print("=" * 60)
+        print(f"Total videos watched: {session.total_videos}")
+        print(f"Total duration: {session.total_duration:.1f}s")
+        print(f"Data saved to: ./video_learning_data/{session.session_id}.json")
+
+    agent.on_video_watched = on_video_watched
+    agent.on_progress_update = on_progress_update
+    agent.on_session_complete = on_session_complete
+
+    # Start session
+    session_id = agent.start_session(
+        device_id=device_id,
+        target_count=target_count,
+        category=category,
+        watch_duration=watch_duration,
+    )
+
+    print(f"\nSession started: {session_id}")
+    print("Starting video watching task...\n")
+
+    # Construct the task
+    if category:
+        task = f"""
+请帮我学习抖音上的"{category}"类视频。具体任务如下：
+
+1. 打开抖音应用
+2. 搜索"{category}"
+3. 开始观看视频，每个视频观看约{watch_duration}秒
+4. 记录每个视频的描述、点赞数、评论数等信息
+5. 滑动到下一个视频
+6. 重复步骤3-5，直到观看完{target_count}个视频
+
+请按照以下格式记录每个视频：
+- 视频序号
+- 描述文案（屏幕上的文字）
+- 点赞数（如果有显示）
+- 评论数（如果有显示）
+- 截图
+
+每个视频观看时，请等待{watch_duration}秒后再滑动到下一个。
+        """
+    else:
+        task = f"""
+请帮我学习抖音上的推荐视频。具体任务如下：
+
+1. 打开抖音应用
+2. 在推荐页开始观看视频，每个视频观看约{watch_duration}秒
+3. 记录每个视频的描述、点赞数、评论数等信息
+4. 向上滑动到下一个视频
+5. 重复步骤3-4，直到观看完{target_count}个视频
+
+请按照以下格式记录每个视频：
+- 视频序号
+- 描述文案（屏幕上的文字）
+- 点赞数（如果有显示）
+- 评论数（如果有显示）
+- 截图
+
+每个视频观看时，请等待{watch_duration}秒后再滑动到下一个。
+        """
+
+    # Run the task
+    success = agent.run_learning_task(task)
+
+    if success:
+        print("\n✓ Learning task completed successfully!")
+
+        # Export data
+        json_file = agent.export_data("json")
+        print(f"✓ Data exported to: {json_file}")
+
+        csv_file = agent.export_data("csv")
+        print(f"✓ Data exported to: {csv_file}")
+
+    else:
+        print("\n✗ Learning task failed")
+
+    print("\nSession progress:")
+    progress = agent.get_session_progress()
+    for key, value in progress.items():
+        print(f"  {key}: {value}")
+
+
+if __name__ == "__main__":  # run the demo only when executed directly, not on import
+    main()