Merge branch 'dev_video'

This commit is contained in:
puke
2025-11-12 21:22:08 +08:00
53 changed files with 2158 additions and 438 deletions

View File

@@ -13,7 +13,7 @@
只需输入一个 **主题**,Pixelle-Video 就能自动完成:
- ✍️ 撰写视频文案
- 🎨 生成 AI 配图
- 🎨 生成 AI 配图/视频
- 🗣️ 合成语音解说
- 🎵 添加背景音乐
- 🎬 一键合成视频
@@ -32,6 +32,7 @@
-**全自动生成** - 输入主题,自动生成完整视频
-**AI 智能文案** - 根据主题智能创作解说词,无需自己写脚本
-**AI 生成配图** - 每句话都配上精美的 AI 插图
-**AI 生成视频** - 支持使用 AI 视频生成模型(如 WAN 2.1)创建动态视频内容
-**AI 生成语音** - 支持 Edge-TTS、Index-TTS 等众多主流 TTS 方案
-**背景音乐** - 支持添加 BGM,让视频更有氛围
-**视觉风格** - 多种模板可选,打造独特视频风格
@@ -281,6 +282,12 @@ uv run streamlit run web/app.py
#### 视频模板
决定视频画面的布局和设计。
**模板命名规范**
- `static_*.html`: 静态模板(无需AI生成媒体,纯文字样式)
- `image_*.html`: 图片模板(使用AI生成的图片作为背景)
- `video_*.html`: 视频模板(使用AI生成的视频作为背景)
**使用方法**
- 从下拉菜单选择模板,按尺寸分组显示(竖屏/横屏/方形)
- 点击「预览模板」可以自定义参数测试效果
- 如果懂 HTML,可以在 `templates/` 文件夹创建自己的模板

View File

@@ -13,7 +13,7 @@
Just input a **topic**, and Pixelle-Video will automatically:
- ✍️ Write video script
- 🎨 Generate AI images
- 🎨 Generate AI images/videos
- 🗣️ Synthesize voice narration
- 🎵 Add background music
- 🎬 Create video with one click
@@ -32,6 +32,7 @@ Just input a **topic**, and Pixelle-Video will automatically:
-**Fully Automatic Generation** - Input a topic, automatically generate complete video
-**AI Smart Copywriting** - Intelligently create narration based on topic, no need to write scripts yourself
-**AI Generated Images** - Each sentence comes with beautiful AI illustrations
-**AI Generated Videos** - Support AI video generation models (like WAN 2.1) to create dynamic video content
-**AI Generated Voice** - Support Edge-TTS, Index-TTS and many other mainstream TTS solutions
-**Background Music** - Support adding BGM to make videos more atmospheric
-**Visual Styles** - Multiple templates to choose from, create unique video styles
@@ -281,6 +282,12 @@ Determine what style of images AI generates.
#### Video Template
Determines video layout and design.
**Template Naming Convention**
- `static_*.html`: Static templates (no AI-generated media, text-only styles)
- `image_*.html`: Image templates (uses AI-generated images as background)
- `video_*.html`: Video templates (uses AI-generated videos as background)
**Usage**
- Select template from dropdown menu, displayed grouped by dimension (portrait/landscape/square)
- Click "Preview Template" to test effect with custom parameters
- If you know HTML, you can create your own templates in the `templates/` folder

View File

@@ -43,18 +43,27 @@ async def image_generate(
try:
logger.info(f"Image generation request: {request.prompt[:50]}...")
# Call image service
image_path = await pixelle_video.image(
# Call media service (backward compatible with image API)
media_result = await pixelle_video.media(
prompt=request.prompt,
width=request.width,
height=request.height,
workflow=request.workflow
)
# For backward compatibility, only support image results in /image endpoint
if media_result.is_video:
raise HTTPException(
status_code=400,
detail="Video workflow used. Please use /media/generate endpoint for video generation."
)
return ImageGenerateResponse(
image_path=image_path
image_path=media_result.url
)
except HTTPException:
raise
except Exception as e:
logger.error(f"Image generation error: {e}")
raise HTTPException(status_code=500, detail=str(e))

View File

@@ -73,8 +73,7 @@ async def generate_video_sync(
"max_narration_words": request_body.max_narration_words,
"min_image_prompt_words": request_body.min_image_prompt_words,
"max_image_prompt_words": request_body.max_image_prompt_words,
"image_width": request_body.image_width,
"image_height": request_body.image_height,
# Note: image_width and image_height are now auto-determined from template
"image_workflow": request_body.image_workflow,
"video_fps": request_body.video_fps,
"frame_template": request_body.frame_template,
@@ -161,8 +160,7 @@ async def generate_video_async(
"max_narration_words": request_body.max_narration_words,
"min_image_prompt_words": request_body.min_image_prompt_words,
"max_image_prompt_words": request_body.max_image_prompt_words,
"image_width": request_body.image_width,
"image_height": request_body.image_height,
# Note: image_width and image_height are now auto-determined from template
"image_workflow": request_body.image_workflow,
"video_fps": request_body.video_fps,
"frame_template": request_body.frame_template,

View File

@@ -57,8 +57,7 @@ class VideoGenerateRequest(BaseModel):
max_image_prompt_words: int = Field(60, ge=10, le=200, description="Max image prompt words")
# === Image Parameters ===
image_width: int = Field(1024, description="Image width")
image_height: int = Field(1024, description="Image height")
# Note: image_width and image_height are now auto-determined from template meta tags
image_workflow: Optional[str] = Field(None, description="Custom image workflow")
# === Video Parameters ===

View File

@@ -37,15 +37,29 @@ comfyui:
# Image prompt prefix (optional)
prompt_prefix: "Minimalist black-and-white matchstick figure style illustration, clean lines, simple sketch style"
# Video-specific configuration
video:
# Required: Default workflow to use (no fallback)
# Options: runninghub/video_wan2.1_fusionx.json (recommended, no local setup)
# selfhost/video_wan2.1_fusionx.json (requires local ComfyUI)
default_workflow: runninghub/video_wan2.1_fusionx.json
# Video prompt prefix (optional)
prompt_prefix: "Minimalist black-and-white matchstick figure style illustration, clean lines, simple sketch style"
# ==================== Template Configuration ====================
# Configure default template for video generation
template:
# Default frame template to use when not explicitly specified
# Determines video aspect ratio and layout style
# Template naming convention:
# - static_*.html: Static style templates (no AI-generated media)
# - image_*.html: Templates requiring AI-generated images
# - video_*.html: Templates requiring AI-generated videos
# Options:
# - 1080x1920 (vertical/portrait): default.html, modern.html, elegant.html, etc.
# - 1080x1080 (square): minimal_framed.html, magazine_cover.html, etc.
# - 1920x1080 (horizontal/landscape): film.html, full.html, etc.
# - 1080x1920 (vertical/portrait): image_default.html, image_modern.html, image_elegant.html, static_simple.html, etc.
# - 1080x1080 (square): image_minimal_framed.html, etc.
# - 1920x1080 (horizontal/landscape): image_film.html, image_full.html, etc.
# See templates/ directory for all available templates
default_template: "1080x1920/default.html"
default_template: "1080x1920/image_default.html"

View File

@@ -21,6 +21,10 @@ comfyui:
default_workflow: "runninghub/image_flux.json"
prompt_prefix: "Minimalist illustration style"
video:
default_workflow: "runninghub/video_wan2.1_fusionx.json"
prompt_prefix: "Minimalist illustration style"
tts:
default_workflow: "selfhost/tts_edge.json"
```
@@ -48,6 +52,13 @@ comfyui:
- `default_workflow`: Default image generation workflow
- `prompt_prefix`: Prompt prefix
### Video Configuration
- `default_workflow`: Default video generation workflow
- `runninghub/video_wan2.1_fusionx.json`: Cloud workflow (recommended, no local setup required)
- `selfhost/video_wan2.1_fusionx.json`: Local workflow (requires local ComfyUI support)
- `prompt_prefix`: Video prompt prefix (controls video generation style)
### TTS Configuration
- `default_workflow`: Default TTS workflow

View File

@@ -154,15 +154,39 @@ Suitable for Instagram, WeChat Moments, and other platforms.
---
## Template Naming Convention
Templates follow a unified naming convention to distinguish different types:
- **`static_*.html`**: Static templates
- No AI-generated media content required
- Pure text style rendering
- Suitable for quick generation and low-cost scenarios
- **`image_*.html`**: Image templates
- Uses AI-generated images as background
- Invokes ComfyUI image generation workflows
- Suitable for content requiring visual illustrations
- **`video_*.html`**: Video templates
- Uses AI-generated videos as background
- Invokes ComfyUI video generation workflows
- Creates dynamic video content with enhanced expressiveness
## Template Structure
Templates are located in the `templates/` directory, grouped by size:
```
templates/
├── 1080x1920/ # Portrait (11 templates)
├── 1920x1080/ # Landscape (2 templates)
└── 1080x1080/ # Square (1 template)
├── 1080x1920/ # Portrait
│ ├── static_*.html # Static templates
│ ├── image_*.html # Image templates
│ └── video_*.html # Video templates
├── 1920x1080/ # Landscape
│ └── image_*.html # Image templates
└── 1080x1080/ # Square
└── image_*.html # Image templates
```
---

View File

@@ -16,10 +16,42 @@ Pixelle-Video is built on the ComfyUI architecture and supports custom workflows
Located in `workflows/selfhost/` or `workflows/runninghub/`
Used for Text-to-Speech, supporting various TTS engines:
- Edge-TTS
- Index-TTS (supports voice cloning)
- Other ComfyUI-compatible TTS nodes
### Image Generation Workflows
Located in `workflows/selfhost/` or `workflows/runninghub/`
Used for generating static images as video backgrounds:
- FLUX series models
- Stable Diffusion series models
- Other image generation models
### Video Generation Workflows
Located in `workflows/selfhost/` or `workflows/runninghub/`
**New Feature**: Supports AI video generation to create dynamic video content.
**Preset Workflows**:
- `runninghub/video_wan2.1_fusionx.json`: Cloud workflow (recommended)
- Based on WAN 2.1 model
- No local setup required, accessed via RunningHub API
- Supports Text-to-Video generation
- `selfhost/video_wan2.1_fusionx.json`: Local workflow
- Requires local ComfyUI environment
- Requires installation of corresponding video generation nodes
- Suitable for users with local GPU
**Use Cases**:
- Works with `video_*.html` templates
- Automatically generates dynamic video backgrounds based on scripts
- Enhances visual expressiveness and viewing experience
---
## Custom Workflows

View File

@@ -21,6 +21,10 @@ comfyui:
default_workflow: "runninghub/image_flux.json"
prompt_prefix: "Minimalist illustration style"
video:
default_workflow: "runninghub/video_wan2.1_fusionx.json"
prompt_prefix: "Minimalist illustration style"
tts:
default_workflow: "selfhost/tts_edge.json"
```
@@ -48,6 +52,13 @@ comfyui:
- `default_workflow`: 默认图像生成工作流
- `prompt_prefix`: 提示词前缀
### 视频配置
- `default_workflow`: 默认视频生成工作流
- `runninghub/video_wan2.1_fusionx.json`: 云端工作流(推荐,无需本地环境)
- `selfhost/video_wan2.1_fusionx.json`: 本地工作流(需要本地 ComfyUI 支持)
- `prompt_prefix`: 视频提示词前缀(用于控制视频生成风格)
### TTS 配置
- `default_workflow`: 默认 TTS 工作流

View File

@@ -154,15 +154,39 @@
---
## 模板命名规范
模板采用统一的命名规范来区分不同类型:
- **`static_*.html`**: 静态模板
- 无需 AI 生成任何媒体内容
- 纯文字样式渲染
- 适合快速生成、低成本场景
- **`image_*.html`**: 图片模板
- 使用 AI 生成的图片作为背景
- 调用 ComfyUI 的图像生成工作流
- 适合需要视觉配图的内容
- **`video_*.html`**: 视频模板
- 使用 AI 生成的视频作为背景
- 调用 ComfyUI 的视频生成工作流
- 创建动态视频内容,增强表现力
## 模板结构
模板位于 `templates/` 目录,按尺寸分组:
```
templates/
├── 1080x1920/ # 竖屏11个模板
├── 1920x1080/ # 横屏2个模板
└── 1080x1080/ # 方形1个模板
├── 1080x1920/ # 竖屏
│ ├── static_*.html # 静态模板
│ ├── image_*.html # 图片模板
│ └── video_*.html # 视频模板
├── 1920x1080/ # 横屏
│ └── image_*.html # 图片模板
└── 1080x1080/ # 方形
└── image_*.html # 图片模板
```
---

View File

@@ -16,10 +16,42 @@ Pixelle-Video 基于 ComfyUI 架构,支持自定义工作流。
位于 `workflows/selfhost/` 或 `workflows/runninghub/`
用于文本转语音(Text-to-Speech),支持多种 TTS 引擎:
- Edge-TTS
- Index-TTS支持声音克隆
- 其他 ComfyUI 兼容的 TTS 节点
### 图像生成工作流
位于 `workflows/selfhost/` 或 `workflows/runninghub/`
用于生成静态图像作为视频背景:
- FLUX 系列模型
- Stable Diffusion 系列模型
- 其他图像生成模型
### 视频生成工作流
位于 `workflows/selfhost/` 或 `workflows/runninghub/`
**新功能**:支持 AI 视频生成,创建动态视频内容。
**预置工作流**
- `runninghub/video_wan2.1_fusionx.json`: 云端工作流(推荐)
- 基于 WAN 2.1 模型
- 无需本地环境,通过 RunningHub API 调用
- 支持文本到视频Text-to-Video
- `selfhost/video_wan2.1_fusionx.json`: 本地工作流
- 需要本地 ComfyUI 环境
- 需要安装相应的视频生成节点
- 适合有本地 GPU 的用户
**使用场景**
- 配合 `video_*.html` 模板使用
- 自动根据文案生成动态视频背景
- 增强视频的视觉表现力和观看体验
---
## 自定义工作流

View File

@@ -0,0 +1,61 @@
# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Media generation result models
"""
from typing import Literal, Optional
from pydantic import BaseModel, Field
class MediaResult(BaseModel):
    """Result produced by a media-generation workflow run.

    A ComfyUI workflow can emit either an image or a video; the
    ``media_type`` field records which kind this result holds, and the
    ``is_image`` / ``is_video`` properties offer convenient checks.

    Examples:
        >>> MediaResult(media_type="image", url="http://example.com/image.png")
        >>> MediaResult(media_type="video", url="http://example.com/video.mp4", duration=5.2)
    """

    # Discriminator for the kind of media the workflow produced.
    media_type: Literal["image", "video"] = Field(
        description="Type of generated media"
    )
    # Location of the generated file (remote URL or local path).
    url: str = Field(
        description="URL or path to the generated media file"
    )
    # Only meaningful for videos; stays None for images.
    duration: Optional[float] = Field(
        None,
        description="Duration in seconds (only applicable for video)"
    )

    @property
    def is_image(self) -> bool:
        """True when the generated media is an image."""
        return self.media_type == "image"

    @property
    def is_video(self) -> bool:
        """True when the generated media is a video."""
        return self.media_type == "video"

View File

@@ -57,16 +57,18 @@ class StoryboardFrame:
"""Single storyboard frame"""
index: int # Frame index (0-based)
narration: str # Narration text
image_prompt: str # Image generation prompt
image_prompt: str # Image generation prompt (can be None for text-only or video)
# Generated resource paths
audio_path: Optional[str] = None # Audio file path
image_path: Optional[str] = None # Original image path
composed_image_path: Optional[str] = None # Composed image path (with subtitles)
video_segment_path: Optional[str] = None # Video segment path
audio_path: Optional[str] = None # Audio file path (narration)
media_type: Optional[str] = None # Media type: "image" or "video" (None if no media)
image_path: Optional[str] = None # Original image path (for image type)
video_path: Optional[str] = None # Original video path (for video type, before composition)
composed_image_path: Optional[str] = None # Composed image path (with subtitles, for image type)
video_segment_path: Optional[str] = None # Final video segment path
# Metadata
duration: float = 0.0 # Audio duration (seconds)
duration: float = 0.0 # Frame duration (seconds, from audio or video)
created_at: Optional[datetime] = None
def __post_init__(self):

View File

@@ -63,8 +63,11 @@ class BasePipeline(ABC):
# Quick access to services (convenience)
self.llm = pixelle_video_core.llm
self.tts = pixelle_video_core.tts
self.image = pixelle_video_core.image
self.media = pixelle_video_core.media
self.video = pixelle_video_core.video
# Backward compatibility alias
self.image = pixelle_video_core.media
@abstractmethod
async def __call__(

View File

@@ -92,8 +92,7 @@ class CustomPipeline(BasePipeline):
ref_audio: Optional[str] = None,
image_workflow: Optional[str] = None,
image_width: int = 1024,
image_height: int = 1024,
# Note: image_width and image_height are now auto-determined from template
frame_template: Optional[str] = None,
video_fps: int = 30,
@@ -118,9 +117,10 @@ class CustomPipeline(BasePipeline):
VideoGenerationResult
Image Generation Logic:
- If template has {{image}} → automatically generates images
- If template has no {{image}} → skips image generation (faster, cheaper)
- To customize: Override the template_requires_image logic in your subclass
- image_*.html templates → automatically generates images
- video_*.html templates → automatically generates videos
- static_*.html templates → skips media generation (faster, cheaper)
- To customize: Override the template type detection logic in your subclass
"""
logger.info("Starting CustomPipeline")
logger.info(f"Input text length: {len(text)} chars")
@@ -152,19 +152,27 @@ class CustomPipeline(BasePipeline):
frame_template = template_config.get("default_template", "1080x1920/default.html")
# ========== Step 0.5: Check template requirements ==========
# Detect if template requires {{image}} parameter
# This allows skipping the entire image generation pipeline for text-only templates
# Detect template type by filename prefix
from pathlib import Path
from pixelle_video.services.frame_html import HTMLFrameGenerator
from pixelle_video.utils.template_util import resolve_template_path
from pixelle_video.utils.template_util import resolve_template_path, get_template_type
template_name = Path(frame_template).name
template_type = get_template_type(template_name)
template_requires_image = (template_type == "image")
# Read media size from template meta tags
template_path = resolve_template_path(frame_template)
generator = HTMLFrameGenerator(template_path)
template_requires_image = generator.requires_image()
image_width, image_height = generator.get_media_size()
logger.info(f"📐 Media size from template: {image_width}x{image_height}")
if template_requires_image:
if template_type == "image":
logger.info(f"📸 Template requires image generation")
else:
logger.info(f" Template does not require images - skipping image generation pipeline")
elif template_type == "video":
logger.info(f"🎬 Template requires video generation")
else: # static
logger.info(f"⚡ Static template - skipping media generation pipeline")
logger.info(f" 💡 Benefits: Faster generation + Lower cost + No ComfyUI dependency")
# ========== Step 1: Process content (CUSTOMIZE THIS) ==========
@@ -194,8 +202,8 @@ class CustomPipeline(BasePipeline):
# ========== Step 2: Generate image prompts (CONDITIONAL - CUSTOMIZE THIS) ==========
self._report_progress(progress_callback, "generating_image_prompts", 0.25)
# IMPORTANT: Check if template actually needs images
# If your template doesn't use {{image}}, you can skip this entire step!
# IMPORTANT: Check if template is image type
# If your template is static_*.html, you can skip this entire step!
if template_requires_image:
# Template requires images - generate image prompts using LLM
from pixelle_video.utils.content_generators import generate_image_prompts

View File

@@ -94,8 +94,7 @@ class StandardPipeline(BasePipeline):
max_image_prompt_words: int = 60,
# === Image Parameters ===
image_width: int = 1024,
image_height: int = 1024,
# Note: image_width and image_height are now auto-determined from template meta tags
image_workflow: Optional[str] = None,
# === Video Parameters ===
@@ -151,9 +150,8 @@ class StandardPipeline(BasePipeline):
min_image_prompt_words: Min image prompt length
max_image_prompt_words: Max image prompt length
image_width: Generated image width (default 1024)
image_height: Generated image height (default 1024)
image_workflow: Image workflow filename (e.g., "image_flux.json", None = use default)
Note: Image/video size is now auto-determined from template meta tags
video_fps: Video frame rate (default 30)
@@ -239,6 +237,16 @@ class StandardPipeline(BasePipeline):
template_config = self.core.config.get("template", {})
frame_template = template_config.get("default_template", "1080x1920/default.html")
# Read media size from template meta tags
from pixelle_video.services.frame_html import HTMLFrameGenerator
from pixelle_video.utils.template_util import resolve_template_path
template_path = resolve_template_path(frame_template)
temp_generator = HTMLFrameGenerator(template_path)
image_width, image_height = temp_generator.get_media_size()
logger.info(f"📐 Media size from template: {image_width}x{image_height}")
# Create storyboard config
config = StoryboardConfig(
task_id=task_id,
@@ -269,11 +277,13 @@ class StandardPipeline(BasePipeline):
)
# ========== Step 0.8: Check template requirements ==========
template_requires_image = self._check_template_requires_image(config.frame_template)
if template_requires_image:
template_media_type = self._check_template_media_type(config.frame_template)
if template_media_type == "video":
logger.info(f"🎬 Template requires video generation")
elif template_media_type == "image":
logger.info(f"📸 Template requires image generation")
else:
logger.info(f"Template does not require images - skipping image generation pipeline")
else: # static
logger.info(f"Static template - skipping media generation pipeline")
logger.info(f" 💡 Benefits: Faster generation + Lower cost + No ComfyUI dependency")
try:
@@ -294,8 +304,61 @@ class StandardPipeline(BasePipeline):
logger.info(f"✅ Split script into {len(narrations)} segments (by lines)")
logger.info(f" Note: n_scenes={n_scenes} is ignored in fixed mode")
# ========== Step 2: Generate image prompts (conditional) ==========
if template_requires_image:
# ========== Step 2: Generate media prompts (conditional) ==========
if template_media_type == "video":
# Video template: generate video prompts
self._report_progress(progress_callback, "generating_video_prompts", 0.15)
from pixelle_video.utils.content_generators import generate_video_prompts
# Override prompt_prefix if provided
original_prefix = None
if prompt_prefix is not None:
image_config = self.core.config.get("comfyui", {}).get("image", {})
original_prefix = image_config.get("prompt_prefix")
image_config["prompt_prefix"] = prompt_prefix
logger.info(f"Using custom prompt_prefix: '{prompt_prefix}'")
try:
# Create progress callback wrapper for video prompt generation
def video_prompt_progress(completed: int, total: int, message: str):
batch_progress = completed / total if total > 0 else 0
overall_progress = 0.15 + (batch_progress * 0.15)
self._report_progress(
progress_callback,
"generating_video_prompts",
overall_progress,
extra_info=message
)
# Generate base video prompts
base_image_prompts = await generate_video_prompts(
self.llm,
narrations=narrations,
min_words=min_image_prompt_words,
max_words=max_image_prompt_words,
progress_callback=video_prompt_progress
)
# Apply prompt prefix
from pixelle_video.utils.prompt_helper import build_image_prompt
image_config = self.core.config.get("comfyui", {}).get("image", {})
prompt_prefix_to_use = prompt_prefix if prompt_prefix is not None else image_config.get("prompt_prefix", "")
image_prompts = []
for base_prompt in base_image_prompts:
final_prompt = build_image_prompt(base_prompt, prompt_prefix_to_use)
image_prompts.append(final_prompt)
finally:
# Restore original prompt_prefix
if original_prefix is not None:
image_config["prompt_prefix"] = original_prefix
logger.info(f"✅ Generated {len(image_prompts)} video prompts")
elif template_media_type == "image":
# Image template: generate image prompts
self._report_progress(progress_callback, "generating_image_prompts", 0.15)
# Override prompt_prefix if provided
@@ -343,12 +406,13 @@ class StandardPipeline(BasePipeline):
image_config["prompt_prefix"] = original_prefix
logger.info(f"✅ Generated {len(image_prompts)} image prompts")
else:
# Skip image prompt generation
else: # text
# Text-only template: skip media prompt generation
image_prompts = [None] * len(narrations)
self._report_progress(progress_callback, "preparing_frames", 0.15)
logger.info(f"⚡ Skipped image prompt generation (template doesn't need images)")
logger.info(f" 💡 Savings: {len(narrations)} LLM calls + {len(narrations)} image generations")
logger.info(f"⚡ Skipped media prompt generation (text-only template)")
logger.info(f" 💡 Savings: {len(narrations)} LLM calls + {len(narrations)} media generations")
# ========== Step 3: Create frames ==========
for i, (narration, image_prompt) in enumerate(zip(narrations, image_prompts)):
@@ -452,29 +516,32 @@ class StandardPipeline(BasePipeline):
logger.error(f"❌ Video generation failed: {e}")
raise
def _check_template_requires_image(self, frame_template: str) -> bool:
def _check_template_media_type(self, frame_template: str) -> str:
"""
Check if template requires image generation
Check template media type requirement
This is checked at pipeline level to avoid unnecessary:
- LLM calls (generating image_prompts)
- Image generation API calls
- LLM calls (generating media prompts)
- Media generation API calls
- ComfyUI dependency
Template naming convention:
- static_*.html: Static style template (returns "static")
- image_*.html: Image template (returns "image")
- video_*.html: Video template (returns "video")
Args:
frame_template: Template path (e.g., "1080x1920/default.html")
frame_template: Template path (e.g., "1080x1920/image_default.html" or "1080x1920/video_default.html")
Returns:
True if template contains {{image}}, False otherwise
"static", "image", or "video"
"""
from pixelle_video.services.frame_html import HTMLFrameGenerator
from pixelle_video.utils.template_util import resolve_template_path
from pixelle_video.utils.template_util import get_template_type
template_path = resolve_template_path(frame_template)
generator = HTMLFrameGenerator(template_path)
# Determine type by template filename prefix
template_name = Path(frame_template).name
template_type = get_template_type(template_name)
requires = generator.requires_image()
logger.debug(f"Template '{frame_template}' requires_image={requires}")
return requires
logger.debug(f"Template '{frame_template}' is {template_type} template")
return template_type

View File

@@ -0,0 +1,133 @@
# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Video prompt generation template
For generating video prompts from narrations.
"""
import json
from typing import List
# LLM prompt template for turning narrations into English video-generation
# prompts.  Placeholders filled by build_video_prompt_prompt():
#   {narrations_count}, {narrations_json}, {min_words}, {max_words}
VIDEO_PROMPT_GENERATION_PROMPT = """# 角色定位
你是一个专业的视频创意设计师,擅长为视频脚本创作富有动感和表现力的视频生成提示词,将叙述内容转化为生动的视频画面。
# 核心任务
基于已有的视频脚本,为每个分镜的"旁白内容"创作对应的**英文**视频生成提示词,确保视频画面与叙述内容完美配合,通过动态画面增强观众的理解和记忆。
**重要:输入包含 {narrations_count} 个旁白,你必须为每个旁白都生成一个对应的视频提示词,总共输出 {narrations_count} 个视频提示词。**
# 输入内容
{narrations_json}
# 输出要求
## 视频提示词规范
- 语言:**必须使用英文**(用于 AI 视频生成模型)
- 描述结构scene + character action + camera movement + emotion + atmosphere
- 描述长度:确保描述清晰完整且富有创意(建议 {min_words}-{max_words} 个英文单词)
- 动态元素:强调动作、运动、变化等动态效果
## 视觉创意要求
- 每个视频都要准确反映对应旁白的具体内容和情感
- 突出画面的动态性:角色动作、物体运动、镜头移动、场景转换等
- 使用象征手法将抽象概念视觉化(如用流动的水代表时间流逝,用上升的阶梯代表进步等)
- 画面要表现出丰富的情感和动作,增强视觉冲击力
- 通过镜头语言(推拉摇移)和剪辑节奏增强表现力
## 关键英文词汇参考
- 动作moving, running, flowing, transforming, growing, falling
- 镜头camera pan, zoom in, zoom out, tracking shot, aerial view
- 转场transition, fade in, fade out, dissolve
- 氛围dynamic, energetic, peaceful, dramatic, mysterious
- 光影lighting changes, shadows moving, sunlight streaming
## 视频与文案配合原则
- 视频要服务于文案,成为文案内容的视觉延伸
- 避免与文案内容无关或矛盾的视觉元素
- 选择最能增强文案说服力的动态表现方式
- 确保观众能通过视频动态快速理解文案的核心观点
## 创意指导
1. **现象描述类文案**:用动态场景表现社会现象的发生过程
2. **原因分析类文案**:用因果关系的动态演变表现内在逻辑
3. **影响论证类文案**:用后果场景的动态展开或对比表现影响程度
4. **深入探讨类文案**:用抽象概念的动态具象化表现深刻思考
5. **结论启发类文案**:用开放式动态场景或指引性运动表现启发性
## 视频特有注意事项
- 强调动态:每个视频都应该包含明显的动作或运动
- 镜头语言:适当使用推拉摇移等镜头技巧增强表现力
- 时长考虑:视频应该是连贯的动态过程,不是静态画面
- 流畅性:注意动作的流畅性和自然性
# 输出格式
严格按照以下JSON格式输出**视频提示词必须是英文**
```json
{{
"video_prompts": [
"[detailed English video prompt with dynamic elements and camera movements]",
"[detailed English video prompt with dynamic elements and camera movements]"
]
}}
```
# 重要提醒
1. 只输出JSON格式内容不要添加任何解释说明
2. 确保JSON格式严格正确可以被程序直接解析
3. 输入是 {{"narrations": [旁白数组]}} 格式,输出是 {{"video_prompts": [视频提示词数组]}} 格式
4. **输出的video_prompts数组必须恰好包含 {narrations_count} 个元素与输入的narrations数组一一对应**
5. **视频提示词必须使用英文**for AI video generation models
6. 视频提示词必须准确反映对应旁白的具体内容和情感
7. 每个视频都要强调动态性和运动感,避免静态描述
8. 适当使用镜头语言增强表现力
9. 确保视频画面能增强文案的说服力和观众的理解度
现在,请为上述 {narrations_count} 个旁白创作对应的 {narrations_count} 个**英文**视频提示词。只输出JSON不要其他内容。
"""


def build_video_prompt_prompt(
    narrations: List[str],
    min_words: int,
    max_words: int
) -> str:
    """
    Build the LLM prompt for generating per-narration video prompts.

    Fix: ``min_words``/``max_words`` were previously passed to ``.format()``
    but the template contained no matching placeholders, so the requested
    word-count range was silently ignored (the template hardcoded 50-100).
    The template now embeds ``{min_words}-{max_words}``.

    Args:
        narrations: Narration text for each storyboard frame.
        min_words: Minimum suggested word count for each video prompt.
        max_words: Maximum suggested word count for each video prompt.

    Returns:
        Fully formatted prompt string ready to send to the LLM.

    Example:
        >>> build_video_prompt_prompt(["a narration"], 50, 100)  # doctest: +SKIP
    """
    # Embed the narrations as pretty-printed JSON so the LLM sees the exact
    # input structure it must mirror in its JSON output.
    narrations_json = json.dumps(
        {"narrations": narrations},
        ensure_ascii=False,
        indent=2
    )
    return VIDEO_PROMPT_GENERATION_PROMPT.format(
        narrations_json=narrations_json,
        narrations_count=len(narrations),
        min_words=min_words,
        max_words=max_words
    )

View File

@@ -23,7 +23,7 @@ from loguru import logger
from pixelle_video.config import config_manager
from pixelle_video.services.llm_service import LLMService
from pixelle_video.services.tts_service import TTSService
from pixelle_video.services.image import ImageService
from pixelle_video.services.media import MediaService
from pixelle_video.services.video import VideoService
from pixelle_video.services.frame_processor import FrameProcessor
from pixelle_video.pipelines.standard import StandardPipeline
@@ -45,7 +45,7 @@ class PixelleVideoCore:
# Use capabilities directly
answer = await pixelle_video.llm("Explain atomic habits")
audio = await pixelle_video.tts("Hello world")
image = await pixelle_video.image(prompt="a cat")
media = await pixelle_video.media(prompt="a cat")
# Check active capabilities
print(f"Using LLM: {pixelle_video.llm.active}")
@@ -56,7 +56,7 @@ class PixelleVideoCore:
├── config (configuration)
├── llm (LLM service - direct OpenAI SDK)
├── tts (TTS service - ComfyKit workflows)
├── image (Image service - ComfyKit workflows)
├── media (Media service - ComfyKit workflows, supports image & video)
└── pipelines (video generation pipelines)
├── standard (standard workflow)
├── custom (custom workflow template)
@@ -77,7 +77,7 @@ class PixelleVideoCore:
# Core services (initialized in initialize())
self.llm: Optional[LLMService] = None
self.tts: Optional[TTSService] = None
self.image: Optional[ImageService] = None
self.media: Optional[MediaService] = None
self.video: Optional[VideoService] = None
self.frame_processor: Optional[FrameProcessor] = None
@@ -105,7 +105,7 @@ class PixelleVideoCore:
# 1. Initialize core services
self.llm = LLMService(self.config)
self.tts = TTSService(self.config)
self.image = ImageService(self.config)
self.media = MediaService(self.config)
self.video = VideoService()
self.frame_processor = FrameProcessor(self)

View File

@@ -18,7 +18,7 @@ Core services providing atomic capabilities.
Services:
- LLMService: LLM text generation
- TTSService: Text-to-speech
- ImageService: Image generation
- MediaService: Media generation (image & video)
- VideoService: Video processing
- FrameProcessor: Frame processing orchestrator
- ComfyBaseService: Base class for ComfyUI-based services
@@ -27,15 +27,19 @@ Services:
from pixelle_video.services.comfy_base_service import ComfyBaseService
from pixelle_video.services.llm_service import LLMService
from pixelle_video.services.tts_service import TTSService
from pixelle_video.services.image import ImageService
from pixelle_video.services.media import MediaService
from pixelle_video.services.video import VideoService
from pixelle_video.services.frame_processor import FrameProcessor
# Backward compatibility alias
ImageService = MediaService
__all__ = [
"ComfyBaseService",
"LLMService",
"TTSService",
"ImageService",
"MediaService",
"ImageService", # Backward compatibility
"VideoService",
"FrameProcessor",
]

View File

@@ -77,21 +77,6 @@ class HTMLFrameGenerator:
self._check_linux_dependencies()
logger.debug(f"Loaded HTML template: {template_path} (size: {self.width}x{self.height})")
def requires_image(self) -> bool:
    """
    Detect if template requires {{image}} parameter

    Checks whether the loaded template text references the {{image}}
    variable. When it does not, the caller can skip the entire image
    generation pipeline, which improves:
    - Generation speed (no image generation API calls)
    - Cost efficiency (no LLM calls for image prompts)
    - Dependency requirements (no ComfyUI needed)

    Returns:
        True if template contains {{image}}, False otherwise
    """
    placeholder = '{{image}}'
    return self.template.find(placeholder) >= 0
def _check_linux_dependencies(self):
"""Check Linux system dependencies and warn if missing"""
@@ -141,6 +126,58 @@ class HTMLFrameGenerator:
logger.debug(f"Template loaded: {len(content)} chars")
return content
def _parse_media_size_from_meta(self) -> tuple[Optional[int], Optional[int]]:
    """
    Parse media size from meta tags in template

    Looks for meta tags:
    - <meta name="template:media-width" content="1024">
    - <meta name="template:media-height" content="1024">

    Both tags must be present and hold positive integer content for a
    size to be returned; otherwise (None, None) is returned.

    Returns:
        Tuple of (width, height) or (None, None) if not found
    """
    # NOTE(review): bs4 is imported outside the try block, so a missing
    # bs4 install raises ImportError to the caller instead of being
    # downgraded to the warning below — confirm this is intended.
    from bs4 import BeautifulSoup

    try:
        soup = BeautifulSoup(self.template, 'html.parser')

        # Find width and height meta tags
        width_meta = soup.find('meta', attrs={'name': 'template:media-width'})
        height_meta = soup.find('meta', attrs={'name': 'template:media-height'})

        if width_meta and height_meta:
            # Non-numeric content raises ValueError here, which is caught
            # below and treated the same as "no size declared".
            width = int(width_meta.get('content', 0))
            height = int(height_meta.get('content', 0))

            if width > 0 and height > 0:
                logger.debug(f"Found media size in meta tags: {width}x{height}")
                return width, height

        return None, None

    except Exception as e:
        logger.warning(f"Failed to parse media size from meta tags: {e}")
        return None, None
def get_media_size(self) -> tuple[int, int]:
    """
    Get media size for image/video generation

    Returns the size declared in the template's meta tags. When the
    template declares no size, falls back to 1024x1024 with a warning
    (properly configured templates should always declare a size).

    Returns:
        Tuple of (width, height)
    """
    declared = self._parse_media_size_from_meta()
    if all(declared):
        return declared

    # Fallback to default if not specified (should not happen with properly configured templates)
    logger.warning(f"No media size meta tags found in template {self.template_path}, using fallback 1024x1024")
    return 1024, 1024
def parse_template_parameters(self) -> Dict[str, Dict[str, Any]]:
"""
Parse custom parameters from HTML template

View File

@@ -84,7 +84,7 @@ class FrameProcessor:
))
await self._step_generate_audio(frame, config)
# Step 2: Generate image (conditional)
# Step 2: Generate media (image or video, conditional)
if needs_image:
if progress_callback:
progress_callback(ProgressEvent(
@@ -93,12 +93,13 @@ class FrameProcessor:
frame_current=frame_num,
frame_total=total_frames,
step=2,
action="image"
action="media"
))
await self._step_generate_image(frame, config)
await self._step_generate_media(frame, config)
else:
frame.image_path = None
logger.debug(f" 2/4: Skipped image generation (not required by template)")
frame.media_type = None
logger.debug(f" 2/4: Skipped media generation (not required by template)")
# Step 3: Compose frame (add subtitle)
if progress_callback:
@@ -176,27 +177,66 @@ class FrameProcessor:
logger.debug(f" ✓ Audio generated: {audio_path} ({frame.duration:.2f}s)")
async def _step_generate_image(
async def _step_generate_media(
self,
frame: StoryboardFrame,
config: StoryboardConfig
):
"""Step 2: Generate image using ComfyKit"""
logger.debug(f" 2/4: Generating image for frame {frame.index}...")
"""Step 2: Generate media (image or video) using ComfyKit"""
logger.debug(f" 2/4: Generating media for frame {frame.index}...")
# Call Image generation (with optional preset)
image_url = await self.core.image(
# Determine media type based on workflow
# video_ prefix in workflow name indicates video generation
workflow_name = config.image_workflow or ""
is_video_workflow = "video_" in workflow_name.lower()
media_type = "video" if is_video_workflow else "image"
logger.debug(f" → Media type: {media_type} (workflow: {workflow_name})")
# Call Media generation (with optional preset)
media_result = await self.core.media(
prompt=frame.image_prompt,
workflow=config.image_workflow, # Pass workflow from config (None = use default)
media_type=media_type,
width=config.image_width,
height=config.image_height
)
# Download image to local (pass task_id)
local_path = await self._download_image(image_url, frame.index, config.task_id)
frame.image_path = local_path
# Store media type
frame.media_type = media_result.media_type
logger.debug(f" ✓ Image generated: {local_path}")
if media_result.is_image:
# Download image to local (pass task_id)
local_path = await self._download_media(
media_result.url,
frame.index,
config.task_id,
media_type="image"
)
frame.image_path = local_path
logger.debug(f" ✓ Image generated: {local_path}")
elif media_result.is_video:
# Download video to local (pass task_id)
local_path = await self._download_media(
media_result.url,
frame.index,
config.task_id,
media_type="video"
)
frame.video_path = local_path
# Update duration from video if available
if media_result.duration:
frame.duration = media_result.duration
logger.debug(f" ✓ Video generated: {local_path} (duration: {frame.duration:.2f}s)")
else:
# Get video duration from file
frame.duration = await self._get_video_duration(local_path)
logger.debug(f" ✓ Video generated: {local_path} (duration: {frame.duration:.2f}s)")
else:
raise ValueError(f"Unknown media type: {media_result.media_type}")
async def _step_compose_frame(
self,
@@ -211,7 +251,9 @@ class FrameProcessor:
from pixelle_video.utils.os_util import get_task_frame_path
output_path = get_task_frame_path(config.task_id, frame.index, "composed")
# Use HTML template to compose frame
# For video type: render HTML as transparent overlay image
# For image type: render HTML with image background
# In both cases, we need the composed image
composed_path = await self._compose_frame_html(frame, storyboard, config, output_path)
frame.composed_image_path = composed_path
@@ -264,23 +306,60 @@ class FrameProcessor:
frame: StoryboardFrame,
config: StoryboardConfig
):
"""Step 4: Create video segment from image + audio"""
"""Step 4: Create video segment from media + audio"""
logger.debug(f" 4/4: Creating video segment for frame {frame.index}...")
# Generate output path using task_id
from pixelle_video.utils.os_util import get_task_frame_path
output_path = get_task_frame_path(config.task_id, frame.index, "segment")
# Call video compositor to create video from image + audio
from pixelle_video.services.video import VideoService
video_service = VideoService()
segment_path = video_service.create_video_from_image(
image=frame.composed_image_path,
audio=frame.audio_path,
output=output_path,
fps=config.video_fps
)
# Branch based on media type
if frame.media_type == "video":
# Video workflow: overlay HTML template on video, then add audio
logger.debug(f" → Using video-based composition with HTML overlay")
# Step 1: Overlay transparent HTML image on video
# The composed_image_path contains the rendered HTML with transparent background
temp_video_with_overlay = get_task_frame_path(config.task_id, frame.index, "video") + "_overlay.mp4"
video_service.overlay_image_on_video(
video=frame.video_path,
overlay_image=frame.composed_image_path,
output=temp_video_with_overlay,
scale_mode="contain" # Scale video to fit template size (contain mode)
)
# Step 2: Add narration audio to the overlaid video
# Note: The video might have audio (replaced) or be silent (audio added)
segment_path = video_service.merge_audio_video(
video=temp_video_with_overlay,
audio=frame.audio_path,
output=output_path,
replace_audio=True, # Replace video audio with narration
audio_volume=1.0
)
# Clean up temp file
import os
if os.path.exists(temp_video_with_overlay):
os.unlink(temp_video_with_overlay)
elif frame.media_type == "image" or frame.media_type is None:
# Image workflow: create video from image + audio
logger.debug(f" → Using image-based composition")
segment_path = video_service.create_video_from_image(
image=frame.composed_image_path,
audio=frame.audio_path,
output=output_path,
fps=config.video_fps
)
else:
raise ValueError(f"Unknown media type: {frame.media_type}")
frame.video_segment_path = segment_path
@@ -303,10 +382,16 @@ class FrameProcessor:
estimated_duration = file_size / 2000
return max(1.0, estimated_duration) # At least 1 second
async def _download_image(self, url: str, frame_index: int, task_id: str) -> str:
"""Download image from URL to local file"""
async def _download_media(
self,
url: str,
frame_index: int,
task_id: str,
media_type: str
) -> str:
"""Download media (image or video) from URL to local file"""
from pixelle_video.utils.os_util import get_task_frame_path
output_path = get_task_frame_path(task_id, frame_index, "image")
output_path = get_task_frame_path(task_id, frame_index, media_type)
async with httpx.AsyncClient() as client:
response = await client.get(url)
@@ -316,4 +401,16 @@ class FrameProcessor:
f.write(response.content)
return output_path
async def _get_video_duration(self, video_path: str) -> float:
    """
    Get video duration in seconds via ffprobe.

    Args:
        video_path: Path to the video file to probe

    Returns:
        Duration in seconds, or 1.0 if the duration cannot be determined
        (ffmpeg missing, file unreadable, or malformed metadata).
    """
    try:
        import ffmpeg
        probe = ffmpeg.probe(video_path)
        return float(probe['format']['duration'])
    except Exception as e:
        # No reliable fallback exists here (the frame's audio duration is
        # not available in this scope), so degrade to a 1-second default.
        # Fixed: the previous message claimed "using audio duration" even
        # though a fixed 1.0s was returned.
        logger.warning(f"Failed to get video duration: {e}, falling back to 1.0s default")
        return 1.0

View File

@@ -1,192 +0,0 @@
# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Image Generation Service - ComfyUI Workflow-based implementation
"""
from typing import Optional
from comfykit import ComfyKit
from loguru import logger
from pixelle_video.services.comfy_base_service import ComfyBaseService
class ImageService(ComfyBaseService):
    """
    Image generation service - Workflow-based

    Uses ComfyKit to execute image generation workflows.

    NOTE(review): superseded by MediaService (services/media.py); the
    package __init__ keeps ``ImageService`` as a backward-compatibility
    alias for it.

    Usage:
        # Use default workflow (workflows/image_flux.json)
        image_url = await pixelle_video.image(prompt="a cat")

        # Use specific workflow
        image_url = await pixelle_video.image(
            prompt="a cat",
            workflow="image_flux.json"
        )

        # List available workflows
        workflows = pixelle_video.image.list_workflows()
    """

    # Only workflow files named "image_*.json" are exposed by this service.
    WORKFLOW_PREFIX = "image_"
    DEFAULT_WORKFLOW = None  # No hardcoded default, must be configured
    WORKFLOWS_DIR = "workflows"

    def __init__(self, config: dict) -> None:
        """
        Initialize image service

        Args:
            config: Full application config dict
        """
        super().__init__(config, service_name="image")

    async def __call__(
        self,
        prompt: str,
        workflow: Optional[str] = None,
        # ComfyUI connection (optional overrides)
        comfyui_url: Optional[str] = None,
        runninghub_api_key: Optional[str] = None,
        # Common workflow parameters
        width: Optional[int] = None,
        height: Optional[int] = None,
        negative_prompt: Optional[str] = None,
        steps: Optional[int] = None,
        seed: Optional[int] = None,
        cfg: Optional[float] = None,
        sampler: Optional[str] = None,
        **params
    ) -> str:
        """
        Generate image using workflow

        Args:
            prompt: Image generation prompt
            workflow: Workflow filename (default: from config or "image_flux.json")
            comfyui_url: ComfyUI URL (optional, overrides config)
            runninghub_api_key: RunningHub API key (optional, overrides config)
            width: Image width
            height: Image height
            negative_prompt: Negative prompt
            steps: Sampling steps
            seed: Random seed
            cfg: CFG scale
            sampler: Sampler name
            **params: Additional workflow parameters

        Returns:
            Generated image URL/path

        Raises:
            Exception: If workflow execution fails or produces no images.

        Examples:
            # Simplest: use default workflow (workflows/image_flux.json)
            image_url = await pixelle_video.image(prompt="a beautiful cat")

            # Use specific workflow
            image_url = await pixelle_video.image(
                prompt="a cat",
                workflow="image_flux.json"
            )

            # With additional parameters
            image_url = await pixelle_video.image(
                prompt="a cat",
                workflow="image_flux.json",
                width=1024,
                height=1024,
                steps=20,
                seed=42
            )

            # With absolute path
            image_url = await pixelle_video.image(
                prompt="a cat",
                workflow="/path/to/custom.json"
            )

            # With custom ComfyUI server
            image_url = await pixelle_video.image(
                prompt="a cat",
                comfyui_url="http://192.168.1.100:8188"
            )
        """
        # 1. Resolve workflow (returns structured info)
        workflow_info = self._resolve_workflow(workflow=workflow)

        # 2. Prepare ComfyKit config (supports both selfhost and runninghub)
        kit_config = self._prepare_comfykit_config(
            comfyui_url=comfyui_url,
            runninghub_api_key=runninghub_api_key
        )

        # 3. Build workflow parameters
        workflow_params = {"prompt": prompt}

        # Add optional parameters — only forwarded when explicitly provided,
        # so the workflow JSON's own defaults apply otherwise.
        if width is not None:
            workflow_params["width"] = width
        if height is not None:
            workflow_params["height"] = height
        if negative_prompt is not None:
            workflow_params["negative_prompt"] = negative_prompt
        if steps is not None:
            workflow_params["steps"] = steps
        if seed is not None:
            workflow_params["seed"] = seed
        if cfg is not None:
            workflow_params["cfg"] = cfg
        if sampler is not None:
            workflow_params["sampler"] = sampler

        # Add any additional parameters
        workflow_params.update(params)

        logger.debug(f"Workflow parameters: {workflow_params}")

        # 4. Execute workflow (ComfyKit auto-detects based on input type)
        try:
            kit = ComfyKit(**kit_config)

            # Determine what to pass to ComfyKit based on source
            if workflow_info["source"] == "runninghub" and "workflow_id" in workflow_info:
                # RunningHub: pass workflow_id (ComfyKit will use runninghub backend)
                workflow_input = workflow_info["workflow_id"]
                logger.info(f"Executing RunningHub workflow: {workflow_input}")
            else:
                # Selfhost: pass file path (ComfyKit will use local ComfyUI)
                workflow_input = workflow_info["path"]
                logger.info(f"Executing selfhost workflow: {workflow_input}")

            result = await kit.execute(workflow_input, workflow_params)

            # 5. Handle result
            if result.status != "completed":
                error_msg = result.msg or "Unknown error"
                logger.error(f"Image generation failed: {error_msg}")
                raise Exception(f"Image generation failed: {error_msg}")

            if not result.images:
                logger.error("No images generated")
                raise Exception("No images generated")

            image_url = result.images[0]
            logger.info(f"✅ Generated image: {image_url}")
            return image_url

        except Exception as e:
            # Log and re-raise so callers see the original failure.
            logger.error(f"Image generation error: {e}")
            raise

View File

@@ -0,0 +1,285 @@
# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Media Generation Service - ComfyUI Workflow-based implementation
Supports both image and video generation workflows.
Automatically detects output type based on ExecuteResult.
"""
from typing import Optional
from comfykit import ComfyKit
from loguru import logger
from pixelle_video.services.comfy_base_service import ComfyBaseService
from pixelle_video.models.media import MediaResult
class MediaService(ComfyBaseService):
    """
    Media generation service - Workflow-based

    Uses ComfyKit to execute image/video generation workflows.
    Supports both image_ and video_ workflow prefixes.

    Usage:
        # Use default workflow (workflows/image_flux.json)
        media = await pixelle_video.media(prompt="a cat")
        if media.is_image:
            print(f"Generated image: {media.url}")
        elif media.is_video:
            print(f"Generated video: {media.url} ({media.duration}s)")

        # Use specific workflow
        media = await pixelle_video.media(
            prompt="a cat",
            workflow="image_flux.json"
        )

        # List available workflows
        workflows = pixelle_video.media.list_workflows()
    """

    # Unused by this service: _scan_workflows below matches both the
    # image_ and video_ prefixes instead of a single class-level prefix.
    WORKFLOW_PREFIX = ""
    DEFAULT_WORKFLOW = None  # No hardcoded default, must be configured
    WORKFLOWS_DIR = "workflows"

    def __init__(self, config: dict) -> None:
        """
        Initialize media service

        Args:
            config: Full application config dict
        """
        super().__init__(config, service_name="image")  # Keep "image" for config compatibility

    def _scan_workflows(self):
        """
        Scan workflows for both image_ and video_ prefixes

        Override parent method to support multiple prefixes.

        Returns:
            List of workflow info dicts sorted by "key" (source/name);
            empty list when no workflow source directories exist.
        """
        from pixelle_video.utils.os_util import list_resource_dirs, list_resource_files, get_resource_path
        from pathlib import Path

        workflows = []

        # Get all workflow source directories
        source_dirs = list_resource_dirs("workflows")
        if not source_dirs:
            logger.warning("No workflow source directories found")
            return workflows

        # Scan each source directory for workflow files
        for source_name in source_dirs:
            # Get all JSON files for this source
            workflow_files = list_resource_files("workflows", source_name)

            # Filter to only files matching image_ or video_ prefix
            matching_files = [
                f for f in workflow_files
                if (f.startswith("image_") or f.startswith("video_")) and f.endswith('.json')
            ]

            for filename in matching_files:
                try:
                    # Get actual file path
                    file_path = Path(get_resource_path("workflows", source_name, filename))
                    workflow_info = self._parse_workflow_file(file_path, source_name)
                    workflows.append(workflow_info)
                    logger.debug(f"Found workflow: {workflow_info['key']}")
                except Exception as e:
                    # Fixed: report the actual filename (was a literal "(unknown)" placeholder)
                    logger.error(f"Failed to parse workflow {source_name}/{filename}: {e}")

        # Sort by key (source/name)
        return sorted(workflows, key=lambda w: w["key"])

    async def __call__(
        self,
        prompt: str,
        workflow: Optional[str] = None,
        # Media type specification (required for proper handling)
        media_type: str = "image",  # "image" or "video"
        # ComfyUI connection (optional overrides)
        comfyui_url: Optional[str] = None,
        runninghub_api_key: Optional[str] = None,
        # Common workflow parameters
        width: Optional[int] = None,
        height: Optional[int] = None,
        negative_prompt: Optional[str] = None,
        steps: Optional[int] = None,
        seed: Optional[int] = None,
        cfg: Optional[float] = None,
        sampler: Optional[str] = None,
        **params
    ) -> MediaResult:
        """
        Generate media (image or video) using workflow

        Media type must be specified explicitly via media_type parameter.
        Returns a MediaResult object containing media type and URL.

        Args:
            prompt: Media generation prompt
            workflow: Workflow filename (default: from config or "image_flux.json")
            media_type: Type of media to generate - "image" or "video" (default: "image")
            comfyui_url: ComfyUI URL (optional, overrides config)
            runninghub_api_key: RunningHub API key (optional, overrides config)
            width: Media width
            height: Media height
            negative_prompt: Negative prompt
            steps: Sampling steps
            seed: Random seed
            cfg: CFG scale
            sampler: Sampler name
            **params: Additional workflow parameters

        Returns:
            MediaResult object with media_type ("image" or "video") and url

        Raises:
            ValueError: If media_type is not "image" or "video"
            Exception: If workflow execution fails or produces no output

        Examples:
            # Simplest: use default workflow (workflows/image_flux.json)
            media = await pixelle_video.media(prompt="a beautiful cat")
            if media.is_image:
                print(f"Image: {media.url}")

            # Use specific workflow
            media = await pixelle_video.media(
                prompt="a cat",
                workflow="image_flux.json"
            )

            # Video workflow (media_type="video" selects the video output)
            media = await pixelle_video.media(
                prompt="a cat running",
                workflow="video_wan.json",
                media_type="video"
            )
            if media.is_video:
                print(f"Video: {media.url}, duration: {media.duration}s")

            # With additional parameters
            media = await pixelle_video.media(
                prompt="a cat",
                workflow="image_flux.json",
                width=1024,
                height=1024,
                steps=20,
                seed=42
            )

            # With absolute path
            media = await pixelle_video.media(
                prompt="a cat",
                workflow="/path/to/custom.json"
            )

            # With custom ComfyUI server
            media = await pixelle_video.media(
                prompt="a cat",
                comfyui_url="http://192.168.1.100:8188"
            )
        """
        # 0. Validate media_type early so a typo fails fast instead of
        #    silently falling through to the image branch below.
        if media_type not in ("image", "video"):
            raise ValueError(f"Invalid media_type: {media_type!r} (expected 'image' or 'video')")

        # 1. Resolve workflow (returns structured info)
        workflow_info = self._resolve_workflow(workflow=workflow)

        # 2. Prepare ComfyKit config (supports both selfhost and runninghub)
        kit_config = self._prepare_comfykit_config(
            comfyui_url=comfyui_url,
            runninghub_api_key=runninghub_api_key
        )

        # 3. Build workflow parameters
        workflow_params = {"prompt": prompt}

        # Add optional parameters — only forwarded when explicitly provided,
        # so the workflow JSON's own defaults apply otherwise.
        if width is not None:
            workflow_params["width"] = width
        if height is not None:
            workflow_params["height"] = height
        if negative_prompt is not None:
            workflow_params["negative_prompt"] = negative_prompt
        if steps is not None:
            workflow_params["steps"] = steps
        if seed is not None:
            workflow_params["seed"] = seed
        if cfg is not None:
            workflow_params["cfg"] = cfg
        if sampler is not None:
            workflow_params["sampler"] = sampler

        # Add any additional parameters
        workflow_params.update(params)

        logger.debug(f"Workflow parameters: {workflow_params}")

        # 4. Execute workflow (ComfyKit auto-detects based on input type)
        try:
            kit = ComfyKit(**kit_config)

            # Determine what to pass to ComfyKit based on source
            if workflow_info["source"] == "runninghub" and "workflow_id" in workflow_info:
                # RunningHub: pass workflow_id (ComfyKit will use runninghub backend)
                workflow_input = workflow_info["workflow_id"]
                logger.info(f"Executing RunningHub workflow: {workflow_input}")
            else:
                # Selfhost: pass file path (ComfyKit will use local ComfyUI)
                workflow_input = workflow_info["path"]
                logger.info(f"Executing selfhost workflow: {workflow_input}")

            result = await kit.execute(workflow_input, workflow_params)

            # 5. Handle result based on specified media_type
            if result.status != "completed":
                error_msg = result.msg or "Unknown error"
                logger.error(f"Media generation failed: {error_msg}")
                raise Exception(f"Media generation failed: {error_msg}")

            # Extract media based on specified type
            if media_type == "video":
                # Video workflow - get video from result
                if not result.videos:
                    logger.error("No video generated (workflow returned no videos)")
                    raise Exception("No video generated")

                video_url = result.videos[0]
                logger.info(f"✅ Generated video: {video_url}")

                # Try to extract duration from result (if available)
                duration = None
                if hasattr(result, 'duration') and result.duration:
                    duration = result.duration

                return MediaResult(
                    media_type="video",
                    url=video_url,
                    duration=duration
                )
            else:  # image
                # Image workflow - get image from result
                if not result.images:
                    logger.error("No image generated (workflow returned no images)")
                    raise Exception("No image generated")

                image_url = result.images[0]
                logger.info(f"✅ Generated image: {image_url}")

                return MediaResult(
                    media_type="image",
                    url=image_url
                )

        except Exception as e:
            # Log and re-raise so callers see the original failure.
            logger.error(f"Media generation error: {e}")
            raise

View File

@@ -224,20 +224,88 @@ class VideoService:
-map "[v]" -map "[a]" output.mp4
"""
try:
inputs = [ffmpeg.input(v) for v in videos]
(
ffmpeg
.concat(*inputs, v=1, a=1)
.output(output)
.overwrite_output()
.run(capture_stdout=True, capture_stderr=True)
# Build filter_complex string manually
n = len(videos)
# Build input stream labels: [0:v][0:a][1:v][1:a]...
stream_spec = "".join([f"[{i}:v][{i}:a]" for i in range(n)])
filter_complex = f"{stream_spec}concat=n={n}:v=1:a=1[v][a]"
# Build ffmpeg command
cmd = ['ffmpeg']
for video in videos:
cmd.extend(['-i', video])
cmd.extend([
'-filter_complex', filter_complex,
'-map', '[v]',
'-map', '[a]',
'-y', # Overwrite output
output
])
# Run command
import subprocess
result = subprocess.run(
cmd,
capture_output=True,
text=True,
check=True
)
logger.success(f"Videos concatenated successfully: {output}")
return output
except ffmpeg.Error as e:
error_msg = e.stderr.decode() if e.stderr else str(e)
except subprocess.CalledProcessError as e:
error_msg = e.stderr if e.stderr else str(e)
logger.error(f"FFmpeg concat filter error: {error_msg}")
raise RuntimeError(f"Failed to concatenate videos: {error_msg}")
except Exception as e:
logger.error(f"Concatenation error: {e}")
raise RuntimeError(f"Failed to concatenate videos: {e}")
def _get_video_duration(self, video: str) -> float:
    """Return the duration of *video* in seconds, or 0.0 when probing fails."""
    try:
        metadata = ffmpeg.probe(video)
        return float(metadata['format']['duration'])
    except Exception as err:
        # Probe failures (bad path, unreadable container) degrade to 0.0.
        logger.warning(f"Failed to get video duration: {err}")
        return 0.0
def _get_audio_duration(self, audio: str) -> float:
    """Return the duration of *audio* in seconds.

    When probing fails, falls back to a very rough file-size estimate
    (~16kbps MP3, i.e. 2KB per second), floored at 1 second.
    """
    try:
        return float(ffmpeg.probe(audio)['format']['duration'])
    except Exception as err:
        logger.warning(f"Failed to get audio duration: {err}, using estimate")
        import os
        size_bytes = os.path.getsize(audio)
        # ~16kbps MP3 => roughly 2000 bytes per second of audio
        return max(1.0, size_bytes / 2000)
def has_audio_stream(self, video: str) -> bool:
    """
    Check if video has audio stream

    Args:
        video: Video file path

    Returns:
        True if video has audio stream, False otherwise (including when
        the file cannot be probed, which is treated as "no audio").
    """
    try:
        streams = ffmpeg.probe(video).get('streams', [])
        has_audio = any(s['codec_type'] == 'audio' for s in streams)
        logger.debug(f"Video {video} has_audio={has_audio}")
        return has_audio
    except Exception as err:
        logger.warning(f"Failed to probe video audio streams: {err}, assuming no audio")
        return False
def merge_audio_video(
self,
@@ -247,9 +315,18 @@ class VideoService:
replace_audio: bool = True,
audio_volume: float = 1.0,
video_volume: float = 0.0,
pad_strategy: str = "freeze", # "freeze" (freeze last frame) or "black" (black screen)
) -> str:
"""
Merge audio with video
Merge audio with video, using the longer duration
The output video duration will be the maximum of video and audio duration.
If audio is longer than video, the video will be padded using the specified strategy.
Automatically handles videos with or without audio streams.
- If video has no audio: adds the audio track
- If video has audio and replace_audio=True: replaces with new audio
- If video has audio and replace_audio=False: mixes both audio tracks
Args:
video: Video file path
@@ -259,6 +336,9 @@ class VideoService:
audio_volume: Volume of the new audio (0.0 to 1.0+)
video_volume: Volume of original video audio (0.0 to 1.0+)
Only used when replace_audio=False
pad_strategy: Strategy to pad video if audio is longer
- "freeze": Freeze last frame (default)
- "black": Fill with black screen
Returns:
Path to the output video file
@@ -267,28 +347,115 @@ class VideoService:
RuntimeError: If FFmpeg execution fails
Note:
- When replace_audio=True, video's original audio is removed
- When replace_audio=False, original and new audio are mixed
- Audio is trimmed/extended to match video duration
- Uses the longer duration between video and audio
- When audio is longer, video is padded using pad_strategy
- When video is longer, audio is looped or extended
- Automatically detects if video has audio
- When video is silent, audio is added regardless of replace_audio
- When replace_audio=True and video has audio, original audio is removed
- When replace_audio=False and video has audio, original and new audio are mixed
"""
# Get durations of video and audio
video_duration = self._get_video_duration(video)
audio_duration = self._get_audio_duration(audio)
logger.info(f"Video duration: {video_duration:.2f}s, Audio duration: {audio_duration:.2f}s")
# Determine target duration (max of both)
target_duration = max(video_duration, audio_duration)
logger.info(f"Target output duration: {target_duration:.2f}s")
# Check if video has audio stream
video_has_audio = self.has_audio_stream(video)
# Prepare video stream (potentially with padding)
input_video = ffmpeg.input(video)
video_stream = input_video.video
# Pad video if audio is longer
if audio_duration > video_duration:
pad_duration = audio_duration - video_duration
logger.info(f"Audio is longer, padding video by {pad_duration:.2f}s using '{pad_strategy}' strategy")
if pad_strategy == "freeze":
# Freeze last frame: tpad filter
video_stream = video_stream.filter('tpad', stop_mode='clone', stop_duration=pad_duration)
else: # black
# Generate black frames for padding duration
from pixelle_video.utils.os_util import get_temp_path
import os
# Get video properties
probe = ffmpeg.probe(video)
video_info = next(s for s in probe['streams'] if s['codec_type'] == 'video')
width = int(video_info['width'])
height = int(video_info['height'])
fps_str = video_info['r_frame_rate']
fps_num, fps_den = map(int, fps_str.split('/'))
fps = fps_num / fps_den if fps_den != 0 else 30
# Create black video for padding
black_video_path = get_temp_path(f"black_pad_{os.path.basename(output)}")
black_input = ffmpeg.input(
f'color=c=black:s={width}x{height}:r={fps}',
f='lavfi',
t=pad_duration
)
# Concatenate original video with black padding
video_stream = ffmpeg.concat(video_stream, black_input.video, v=1, a=0)
# Prepare audio stream (pad if needed to match target duration)
input_audio = ffmpeg.input(audio)
audio_stream = input_audio.audio.filter('volume', audio_volume)
# Pad audio with silence if video is longer
if video_duration > audio_duration:
pad_duration = video_duration - audio_duration
logger.info(f"Video is longer, padding audio with {pad_duration:.2f}s silence")
# Use apad to add silence at the end
audio_stream = audio_stream.filter('apad', whole_dur=target_duration)
if not video_has_audio:
logger.info(f"Video has no audio stream, adding audio track")
# Video is silent, just add the audio
try:
(
ffmpeg
.output(
video_stream,
audio_stream,
output,
vcodec='libx264', # Re-encode video if padded
acodec='aac',
audio_bitrate='192k'
)
.overwrite_output()
.run(capture_stdout=True, capture_stderr=True)
)
logger.success(f"Audio added to silent video: {output}")
return output
except ffmpeg.Error as e:
error_msg = e.stderr.decode() if e.stderr else str(e)
logger.error(f"FFmpeg error adding audio to silent video: {error_msg}")
raise RuntimeError(f"Failed to add audio to video: {error_msg}")
# Video has audio, proceed with merging
logger.info(f"Merging audio with video (replace={replace_audio})")
try:
input_video = ffmpeg.input(video)
input_audio = ffmpeg.input(audio)
if replace_audio:
# Replace audio: use only new audio, ignore original
(
ffmpeg
.output(
input_video.video,
input_audio.audio.filter('volume', audio_volume),
video_stream,
audio_stream,
output,
vcodec='copy',
vcodec='libx264', # Re-encode video if padded
acodec='aac',
audio_bitrate='192k',
shortest=None
audio_bitrate='192k'
)
.overwrite_output()
.run(capture_stdout=True, capture_stderr=True)
@@ -298,20 +465,20 @@ class VideoService:
mixed_audio = ffmpeg.filter(
[
input_video.audio.filter('volume', video_volume),
input_audio.audio.filter('volume', audio_volume)
audio_stream
],
'amix',
inputs=2,
duration='first'
duration='longest' # Use longest audio
)
(
ffmpeg
.output(
input_video.video,
video_stream,
mixed_audio,
output,
vcodec='copy',
vcodec='libx264', # Re-encode video if padded
acodec='aac',
audio_bitrate='192k'
)
@@ -326,6 +493,92 @@ class VideoService:
logger.error(f"FFmpeg merge error: {error_msg}")
raise RuntimeError(f"Failed to merge audio and video: {error_msg}")
def overlay_image_on_video(
self,
video: str,
overlay_image: str,
output: str,
scale_mode: str = "contain"
) -> str:
"""
Overlay a transparent image on top of video
Args:
video: Base video file path
overlay_image: Transparent overlay image path (e.g., rendered HTML with transparent background)
output: Output video file path
scale_mode: How to scale the base video to fit the overlay size
- "contain": Scale video to fit within overlay dimensions (letterbox/pillarbox)
- "cover": Scale video to cover overlay dimensions (may crop)
- "stretch": Stretch video to exact overlay dimensions
Returns:
Path to the output video file
Raises:
RuntimeError: If FFmpeg execution fails
Note:
- Overlay image should have transparent background
- Video is scaled to match overlay dimensions based on scale_mode
- Final video size matches overlay image size
- Video codec is re-encoded to support overlay
"""
logger.info(f"Overlaying image on video (scale_mode={scale_mode})")
try:
# Get overlay image dimensions
overlay_probe = ffmpeg.probe(overlay_image)
overlay_stream = next(s for s in overlay_probe['streams'] if s['codec_type'] == 'video')
overlay_width = int(overlay_stream['width'])
overlay_height = int(overlay_stream['height'])
logger.debug(f"Overlay dimensions: {overlay_width}x{overlay_height}")
input_video = ffmpeg.input(video)
input_overlay = ffmpeg.input(overlay_image)
# Scale video to fit overlay size using scale_mode
if scale_mode == "contain":
# Scale to fit (letterbox/pillarbox if aspect ratio differs)
# Use scale filter with force_original_aspect_ratio=decrease and pad to center
scaled_video = (
input_video
.filter('scale', overlay_width, overlay_height, force_original_aspect_ratio='decrease')
.filter('pad', overlay_width, overlay_height, '(ow-iw)/2', '(oh-ih)/2', color='black')
)
elif scale_mode == "cover":
# Scale to cover (crop if aspect ratio differs)
scaled_video = (
input_video
.filter('scale', overlay_width, overlay_height, force_original_aspect_ratio='increase')
.filter('crop', overlay_width, overlay_height)
)
else: # stretch
# Stretch to exact dimensions
scaled_video = input_video.filter('scale', overlay_width, overlay_height)
# Overlay the transparent image on top of the scaled video
output_stream = ffmpeg.overlay(scaled_video, input_overlay)
(
ffmpeg
.output(output_stream, output,
vcodec='libx264',
pix_fmt='yuv420p',
preset='medium',
crf=23)
.overwrite_output()
.run(capture_stdout=True, capture_stderr=True)
)
logger.success(f"Image overlaid on video: {output}")
return output
except ffmpeg.Error as e:
error_msg = e.stderr.decode() if e.stderr else str(e)
logger.error(f"FFmpeg overlay error: {error_msg}")
raise RuntimeError(f"Failed to overlay image on video: {error_msg}")
def create_video_from_image(
self,
image: str,

View File

@@ -321,6 +321,98 @@ async def generate_image_prompts(
return all_prompts
async def generate_video_prompts(
    llm_service,
    narrations: List[str],
    min_words: int = 30,
    max_words: int = 60,
    batch_size: int = 10,
    max_retries: int = 3,
    progress_callback: Optional[callable] = None
) -> List[str]:
    """
    Turn narration sentences into video-generation prompts via the LLM.

    Narrations are processed in batches; each batch is retried on parse or
    validation failure, and the last failure is re-raised once the retry
    budget is exhausted.

    Args:
        llm_service: Async callable LLM service
        narrations: Narration sentences, one prompt is produced per entry
        min_words: Minimum prompt length (words)
        max_words: Maximum prompt length (words)
        batch_size: Narrations sent to the LLM per request (default: 10)
        max_retries: Attempts allowed per batch before giving up (default: 3)
        progress_callback: Optional callback(completed, total, message)

    Returns:
        Base video prompts in narration order (no prefix applied).
    """
    from pixelle_video.prompts.video_generation import build_video_prompt_prompt

    logger.info(f"Generating video prompts for {len(narrations)} narrations (batch_size={batch_size})")

    # Chunk the narrations so each LLM request stays a manageable size.
    chunks = [narrations[start:start + batch_size] for start in range(0, len(narrations), batch_size)]
    logger.info(f"Split into {len(chunks)} batches")

    collected: List[str] = []
    for idx, chunk in enumerate(chunks, 1):
        logger.info(f"Processing batch {idx}/{len(chunks)} ({len(chunk)} narrations)")

        # Per-batch retry loop; the final failed attempt re-raises.
        for attempt in range(1, max_retries + 1):
            try:
                request_text = build_video_prompt_prompt(
                    narrations=chunk,
                    min_words=min_words,
                    max_words=max_words
                )

                response = await llm_service(
                    prompt=request_text,
                    temperature=0.7,
                    max_tokens=8192
                )
                logger.debug(f"Batch {idx} attempt {attempt}: LLM response length: {len(response)} chars")

                parsed = _parse_json(response)
                if "video_prompts" not in parsed:
                    raise KeyError("Invalid response format: missing 'video_prompts'")
                chunk_prompts = parsed["video_prompts"]

                # The LLM must return exactly one prompt per narration.
                if len(chunk_prompts) != len(chunk):
                    raise ValueError(
                        f"Prompt count mismatch: expected {len(chunk)}, got {len(chunk_prompts)}"
                    )

                collected.extend(chunk_prompts)
                logger.info(f"✓ Batch {idx} completed: {len(chunk_prompts)} video prompts")

                if progress_callback:
                    progress_callback(len(collected), len(narrations), f"Batch {idx}/{len(chunks)} completed")

                break  # batch done, continue with the next one
            except Exception as e:
                logger.warning(f"✗ Batch {idx} attempt {attempt} failed: {e}")
                if attempt >= max_retries:
                    raise
                logger.info(f"Retrying batch {idx}...")

    logger.info(f"✅ Generated {len(collected)} video prompts")
    return collected
def _parse_json(text: str) -> dict:
"""
Parse JSON from text, with fallback to extract JSON from markdown code blocks

View File

@@ -260,7 +260,7 @@ def get_task_path(task_id: str, *paths: str) -> str:
def get_task_frame_path(
task_id: str,
frame_index: int,
file_type: Literal["audio", "image", "composed", "segment"]
file_type: Literal["audio", "image", "video", "composed", "segment"]
) -> str:
"""
Get frame file path within task directory
@@ -268,7 +268,7 @@ def get_task_frame_path(
Args:
task_id: Task ID
frame_index: Frame index (0-based internally, but filename starts from 01)
file_type: File type (audio/image/composed/segment)
file_type: File type (audio/image/video/composed/segment)
Returns:
Absolute path to frame file
@@ -280,6 +280,7 @@ def get_task_frame_path(
ext_map = {
"audio": "mp3",
"image": "png",
"video": "mp4",
"composed": "png",
"segment": "mp4"
}

View File

@@ -18,6 +18,7 @@ import os
from pathlib import Path
from typing import List, Tuple, Optional, Literal
from pydantic import BaseModel, Field
import logging
from pixelle_video.utils.os_util import (
get_resource_path,
@@ -26,6 +27,8 @@ from pixelle_video.utils.os_util import (
resource_exists
)
logger = logging.getLogger(__name__)
def parse_template_size(template_path: str) -> Tuple[int, int]:
"""
@@ -316,7 +319,7 @@ def resolve_template_path(template_input: Optional[str]) -> str:
Args:
template_input: Can be:
- None: Use default "1080x1920/default.html"
- None: Use default "1080x1920/image_default.html"
- "template.html": Use default size + this template
- "1080x1920/template.html": Full relative path
- "templates/1080x1920/template.html": Absolute-ish path (legacy)
@@ -330,15 +333,15 @@ def resolve_template_path(template_input: Optional[str]) -> str:
Examples:
>>> resolve_template_path(None)
'templates/1080x1920/default.html'
>>> resolve_template_path("modern.html")
'templates/1080x1920/modern.html'
>>> resolve_template_path("1920x1080/default.html")
'templates/1920x1080/default.html'
'templates/1080x1920/image_default.html'
>>> resolve_template_path("image_modern.html")
'templates/1080x1920/image_modern.html'
>>> resolve_template_path("1920x1080/image_default.html")
'templates/1920x1080/image_default.html'
"""
# Default case
if template_input is None:
template_input = "1080x1920/default.html"
template_input = "1080x1920/image_default.html"
# Parse input to extract size and template name
size = None
@@ -359,6 +362,18 @@ def resolve_template_path(template_input: Optional[str]) -> str:
size = "1080x1920"
template_name = template_input
# Backward compatibility: migrate "default.html" to "image_default.html"
if template_name == "default.html":
migrated_name = "image_default.html"
try:
# Try migrated name first
path = get_resource_path("templates", size, migrated_name)
logger.info(f"Backward compatibility: migrated '{template_input}' to '{size}/{migrated_name}'")
return path
except FileNotFoundError:
# Fall through to try original name
logger.warning(f"Migrated template '{size}/{migrated_name}' not found, trying original name")
# Use resource API to resolve path (custom > default)
try:
return get_resource_path("templates", size, template_name)
@@ -367,6 +382,120 @@ def resolve_template_path(template_input: Optional[str]) -> str:
raise FileNotFoundError(
f"Template not found: {size}/{template_name}\n"
f"Available sizes: {available_sizes}\n"
f"Hint: Use format 'SIZExSIZE/template.html' (e.g., '1080x1920/default.html')"
f"Hint: Use format 'SIZExSIZE/template.html' (e.g., '1080x1920/image_default.html')"
)
def get_template_type(template_name: str) -> Literal['static', 'image', 'video']:
    """
    Infer a template's media type from its filename prefix.

    Naming convention:
        - ``static_*.html``: no AI-generated media (text-only styling)
        - ``image_*.html``: requires an AI-generated image
        - ``video_*.html``: requires an AI-generated video

    Args:
        template_name: Filename (a path is fine; only the basename is used),
            e.g. "image_default.html" or "1080x1920/video_simple.html"

    Returns:
        One of 'static', 'image', or 'video'. Names that don't follow the
        convention fall back to 'image' with a warning.

    Examples:
        >>> get_template_type("static_simple.html")
        'static'
        >>> get_template_type("image_default.html")
        'image'
        >>> get_template_type("video_simple.html")
        'video'
    """
    stem = Path(template_name).name
    for prefix, kind in (("static_", "static"), ("video_", "video"), ("image_", "image")):
        if stem.startswith(prefix):
            return kind
    # Legacy / non-conforming names: assume the historical image behavior.
    logger.warning(
        f"Template '{template_name}' doesn't follow naming convention (static_/image_/video_). "
        f"Defaulting to 'image' type."
    )
    return "image"
def filter_templates_by_type(
    templates: List[TemplateInfo],
    template_type: Literal['static', 'image', 'video']
) -> List[TemplateInfo]:
    """
    Keep only the templates whose filename prefix matches the given type.

    Type detection delegates to :func:`get_template_type`, which inspects
    the template's display name.

    Args:
        templates: TemplateInfo objects to filter
        template_type: 'static', 'image', or 'video'

    Returns:
        New list containing only templates of the requested type,
        preserving the input order.
    """
    return [
        candidate
        for candidate in templates
        if get_template_type(candidate.display_info.name) == template_type
    ]
def get_templates_grouped_by_size_and_type(
    template_type: Optional[Literal['static', 'image', 'video']] = None
) -> dict:
    """
    Group available templates by canvas size, optionally filtered by type.

    Args:
        template_type: Optional filter ('static', 'image', or 'video');
            None keeps every template.

    Returns:
        Dict mapping size string -> list of TemplateInfo. Size groups are
        ordered portrait > landscape > square (then by size name), and
        templates inside each group are sorted by display name.

    Examples:
        >>> all_grouped = get_templates_grouped_by_size_and_type()
        >>> image_grouped = get_templates_grouped_by_size_and_type('image')
    """
    templates = get_all_templates_with_info()
    if template_type is not None:
        templates = filter_templates_by_type(templates, template_type)

    # Bucket by canvas size string (e.g. "1080x1920").
    buckets: dict = {}
    for info in templates:
        buckets.setdefault(info.display_info.size, []).append(info)

    # Orientation rank: portrait first, landscape second, square last;
    # unknown orientations sort after everything else.
    rank = {'portrait': 0, 'landscape': 1, 'square': 2}

    def _group_order(size: str):
        # Each bucket is non-empty by construction, so [0] is safe.
        return (rank.get(buckets[size][0].display_info.orientation, 3), size)

    # Dict insertion order preserves the sorted group ordering.
    return {
        size: sorted(buckets[size], key=lambda t: t.display_info.name)
        for size in sorted(buckets, key=_group_order)
    }

View File

@@ -25,6 +25,7 @@ dependencies = [
"uvicorn[standard]>=0.32.0",
"python-multipart>=0.0.12",
"comfykit>=0.1.9",
"beautifulsoup4>=4.14.2",
]
[project.optional-dependencies]

View File

@@ -2,6 +2,8 @@
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="template:media-width" content="1024">
<meta name="template:media-height" content="1024">
<meta name="viewport" content="width=1080, height=1080">
<title>极简边框风格 - 1080x1080</title>
<style>

View File

@@ -2,6 +2,8 @@
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="template:media-width" content="1024">
<meta name="template:media-height" content="1024">
<meta name="viewport" content="width=1080, height=1920">
<title>模糊背景卡片 - 1080x1920</title>
<!-- Google Fonts - 中文字体 -->

View File

@@ -2,6 +2,8 @@
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="template:media-width" content="1024">
<meta name="template:media-height" content="1024">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{{title}}</title>
<style>

View File

@@ -2,6 +2,8 @@
<html>
<head>
<meta charset="UTF-8">
<meta name="template:media-width" content="1024">
<meta name="template:media-height" content="1024">
<style>
html {
margin: 0;

View File

@@ -2,6 +2,8 @@
<html>
<head>
<meta charset="UTF-8">
<meta name="template:media-width" content="1024">
<meta name="template:media-height" content="1024">
<style>
html {
margin: 0;

View File

@@ -2,6 +2,8 @@
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="template:media-width" content="1024">
<meta name="template:media-height" content="1024">
<meta name="viewport" content="width=1080, height=1920">
<title>时尚复古风格 - 1080x1920</title>
<style>

View File

@@ -2,6 +2,8 @@
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="template:media-width" content="1024">
<meta name="template:media-height" content="1024">
<meta name="viewport" content="width=1080, height=1920">
<title>全屏图片 - 1080x1920</title>
<!-- Google Fonts - 中文字体 -->

View File

@@ -2,6 +2,8 @@
<html>
<head>
<meta charset="UTF-8">
<meta name="template:media-width" content="1024">
<meta name="template:media-height" content="1024">
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Ma+Shan+Zheng&family=ZCOOL+KuaiLe&display=swap" rel="stylesheet">

View File

@@ -2,6 +2,8 @@
<html>
<head>
<meta charset="UTF-8">
<meta name="template:media-width" content="1024">
<meta name="template:media-height" content="1024">
<style>
html {
margin: 0;

View File

@@ -2,6 +2,8 @@
<html lang="zh-CN">
<head>
<meta charset="UTF-8" />
<meta name="template:media-width" content="1024">
<meta name="template:media-height" content="1024">
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>{{title}}</title>
<style>

View File

@@ -2,6 +2,8 @@
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="template:media-width" content="1024">
<meta name="template:media-height" content="1024">
<meta name="viewport" content="width=1080, height=1920">
<title>心理卡片风 - 1080x1920</title>
<style>

View File

@@ -2,6 +2,8 @@
<html>
<head>
<meta charset="UTF-8">
<meta name="template:media-width" content="1024">
<meta name="template:media-height" content="1024">
<style>
html {
margin: 0;

View File

@@ -2,6 +2,8 @@
<html>
<head>
<meta charset="UTF-8">
<meta name="template:media-width" content="1024">
<meta name="template:media-height" content="1024">
<style>
html {
margin: 0;

View File

@@ -0,0 +1,185 @@
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <!-- AI-generated media size consumed by the frame generator.
         NOTE(review): 512x288 is 16:9, matching the 1080x607 overlay slot
         below — presumably the generated video is upscaled to fit; confirm. -->
    <meta name="template:media-width" content="512">
    <meta name="template:media-height" content="288">
    <style>
        html {
            margin: 0;
            padding: 0;
            height: 100%;
        }

        body {
            margin: 0;
            padding: 0;
            width: 100%;
            height: 100vh;
            font-family: 'PingFang SC', 'Source Han Sans', 'Microsoft YaHei', sans-serif;
            overflow: hidden;
            /* background-color: #000; */
            display: flex;
            justify-content: center;
            align-items: center;
        }

        /* Main container - centered, holds all content */
        .main-container {
            position: relative;
            width: 1080px;
            height: 1920px;
        }

        /* Background image layer (customizable using <img> tag) */
        .background-image {
            position: absolute;
            top: 0;
            left: 0;
            width: 100%;
            height: 100%;
            z-index: 0;
        }

        /* Video overlay - centered relative to main-container */
        .video-overlay {
            position: absolute;
            top: 50%;
            left: 50%;
            transform: translate(-50%, -50%);
            width: 1080px;
            height: 607px;
            /* background: #f00; */
            z-index: 1;
        }

        /* Title section - positioned above video */
        .video-title-wrapper {
            position: absolute;
            top: calc(50% - 607px / 2 - 130px);
            left: 50%;
            transform: translateX(-50%);
            max-width: 900px;
            width: 900px;
            text-align: center;
            z-index: 2;
        }

        .video-title {
            font-size: 72px;
            font-weight: 700;
            color: #ffffff;
            line-height: 1.3;
            letter-spacing: 3px;
            text-shadow: 0 4px 20px rgba(0, 0, 0, 0.3);
            margin-bottom: 20px;
        }

        /* Subtitle area - aligned to the bottom edge of the video */
        .content {
            position: absolute;
            bottom: calc(50% - 607px / 2 + 0px);
            left: 50%;
            transform: translateX(-50%);
            width: 900px;
            z-index: 4;
        }

        .text {
            font-size: 40px;
            color: #ffffff;
            text-align: center;
            line-height: 1.6;
            font-weight: 500;
            text-shadow:
                2px 2px 4px rgba(0, 0, 0, 0.9),
                0 0 8px rgba(0, 0, 0, 0.8),
                0 0 16px rgba(0, 0, 0, 0.6);
            padding: 10px 0px;
            /* background-color: aqua; */
        }

        /* Footer - positioned below video */
        .footer {
            position: absolute;
            top: calc(50% + 607px / 2 + 50px);
            left: 50%;
            transform: translateX(-50%);
            width: 900px;
            display: flex;
            align-items: center;
            justify-content: space-between;
            padding-top: 40px;
            border-top: 2px solid rgba(255, 255, 255, 0.3);
            z-index: 2;
        }

        .author-section {
            display: flex;
            flex-direction: column;
            gap: 8px;
        }

        .author {
            font-size: 32px;
            font-weight: 600;
            color: #ffffff;
            text-shadow: 0 2px 8px rgba(0, 0, 0, 0.2);
        }

        .author-desc {
            font-size: 24px;
            color: rgba(255, 255, 255, 0.9);
            font-weight: 400;
        }

        .logo-section {
            display: flex;
            flex-direction: column;
            align-items: flex-end;
            gap: 10px;
        }

        .logo {
            font-size: 28px;
            font-weight: 600;
            color: #ffffff;
            letter-spacing: 2px;
            text-shadow: 0 2px 8px rgba(0, 0, 0, 0.2);
        }
    </style>
</head>
<body>
    <!-- Main container - every element inside is positioned relative to video-overlay -->
    <div class="main-container">
        <!-- Background image layer (customizable via background parameter) -->
        <div class="background-image">
        </div>

        <!-- Video overlay - centering reference point -->
        <div class="video-overlay"></div>

        <!-- Video title - positioned above video -->
        <div class="video-title-wrapper">
            <div class="video-title">{{title}}</div>
        </div>

        <!-- Subtitle area - independently positioned at the bottom of the video -->
        <div class="content">
            <div class="text">{{text}}</div>
        </div>

        <!-- Footer - positioned below video -->
        <div class="footer">
            <div class="author-section">
                <div class="author">{{author=@Pixelle.AI}}</div>
                <div class="author-desc">{{describe=Open Source Omnimodal AI Creative Agent}}</div>
            </div>
            <div class="logo-section">
                <div class="logo">{{brand=Pixelle-Video}}</div>
            </div>
        </div>
    </div>
</body>
</html>

View File

@@ -2,6 +2,8 @@
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="template:media-width" content="1024">
<meta name="template:media-height" content="1024">
<meta name="viewport" content="width=1920, height=1080">
<title>视频模板 - 电影风格</title>
<style>

View File

@@ -2,6 +2,8 @@
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="template:media-width" content="1024">
<meta name="template:media-height" content="1024">
<meta name="viewport" content="width=1920, height=1080">
<title>全屏图片 - 1920x1080</title>
<!-- Google Fonts - 中文字体 -->

View File

@@ -2,6 +2,8 @@
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="template:media-width" content="1024">
<meta name="template:media-height" content="1024">
<meta name="viewport" content="width=1920, height=1080">
<title>视频模板 - 极简风格</title>
<style>

View File

@@ -2,6 +2,8 @@
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="template:media-width" content="1024">
<meta name="template:media-height" content="1024">
<meta name="viewport" content="width=1920, height=1080">
<title>视频模板 - 横屏科技风格</title>
<style>

24
uv.lock generated
View File

@@ -226,6 +226,19 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/2f/eb/f25ad1a7726b2fe21005c3580b35fa7bfe09646faf7c8f41867747987a35/beartype-0.22.4-py3-none-any.whl", hash = "sha256:7967a1cee01fee42e47da69c58c92da10ba5bcfb8072686e48487be5201e3d10", size = 1318387 },
]
[[package]]
name = "beautifulsoup4"
version = "4.14.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "soupsieve" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/77/e9/df2358efd7659577435e2177bfa69cba6c33216681af51a707193dec162a/beautifulsoup4-4.14.2.tar.gz", hash = "sha256:2a98ab9f944a11acee9cc848508ec28d9228abfd522ef0fad6a02a72e0ded69e", size = 625822 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/94/fe/3aed5d0be4d404d12d36ab97e2f1791424d9ca39c2f754a6285d59a3b01d/beautifulsoup4-4.14.2-py3-none-any.whl", hash = "sha256:5ef6fa3a8cbece8488d66985560f97ed091e22bbc4e9c2338508a9d5de6d4515", size = 106392 },
]
[[package]]
name = "blinker"
version = "1.9.0"
@@ -1653,6 +1666,7 @@ name = "pixelle-video"
version = "0.1.2"
source = { editable = "." }
dependencies = [
{ name = "beautifulsoup4" },
{ name = "certifi" },
{ name = "comfykit" },
{ name = "edge-tts" },
@@ -1680,6 +1694,7 @@ dev = [
[package.metadata]
requires-dist = [
{ name = "beautifulsoup4", specifier = ">=4.14.2" },
{ name = "certifi", specifier = ">=2025.10.5" },
{ name = "comfykit", specifier = ">=0.1.9" },
{ name = "edge-tts", specifier = ">=7.2.3" },
@@ -2461,6 +2476,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235 },
]
[[package]]
name = "soupsieve"
version = "2.8"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/6d/e6/21ccce3262dd4889aa3332e5a119a3491a95e8f60939870a3a035aabac0d/soupsieve-2.8.tar.gz", hash = "sha256:e2dd4a40a628cb5f28f6d4b0db8800b8f581b65bb380b97de22ba5ca8d72572f", size = 103472 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/14/a0/bb38d3b76b8cae341dad93a2dd83ab7462e6dbcdd84d43f54ee60a8dc167/soupsieve-2.8-py3-none-any.whl", hash = "sha256:0cc76456a30e20f5d7f2e14a98a4ae2ee4e5abdc7c5ea0aafe795f344bc7984c", size = 36679 },
]
[[package]]
name = "sse-starlette"
version = "3.0.3"

View File

@@ -684,13 +684,41 @@ def main():
st.markdown(f"🔗 [{tr('template.preview_link')}]({template_docs_url})")
# Import template utilities
from pixelle_video.utils.template_util import get_templates_grouped_by_size
from pixelle_video.utils.template_util import get_templates_grouped_by_size_and_type, get_template_type
# Get templates grouped by size
grouped_templates = get_templates_grouped_by_size()
# Template type selector
st.markdown(f"**{tr('template.type_selector')}**")
template_type_options = {
'static': tr('template.type.static'),
'image': tr('template.type.image'),
'video': tr('template.type.video')
}
# Radio buttons in horizontal layout
selected_template_type = st.radio(
tr('template.type_selector'),
options=list(template_type_options.keys()),
format_func=lambda x: template_type_options[x],
index=1, # Default to 'image'
key="template_type_selector",
label_visibility="collapsed",
horizontal=True
)
# Display hint based on selected type (below radio buttons)
if selected_template_type == 'static':
st.info(tr('template.type.static_hint'))
elif selected_template_type == 'image':
st.info(tr('template.type.image_hint'))
elif selected_template_type == 'video':
st.info(tr('template.type.video_hint'))
# Get templates grouped by size, filtered by selected type
grouped_templates = get_templates_grouped_by_size_and_type(selected_template_type)
if not grouped_templates:
st.error("No templates found. Please ensure templates are in templates/ directory with proper structure (e.g., templates/1080x1920/default.html).")
st.warning(f"No {template_type_options[selected_template_type]} templates found. Please select a different type or add templates.")
st.stop()
# Build display options with group separators
@@ -707,7 +735,19 @@ def main():
# Get default template from config
template_config = pixelle_video.config.get("template", {})
config_default_template = template_config.get("default_template", "1080x1920/default.html")
config_default_template = template_config.get("default_template", "1080x1920/image_default.html")
# Backward compatibility
if config_default_template == "1080x1920/default.html":
config_default_template = "1080x1920/image_default.html"
# Determine type-specific default template
type_default_templates = {
'static': '1080x1920/static_default.html',
'image': '1080x1920/image_default.html',
'video': '1080x1920/video_default.html'
}
type_specific_default = type_default_templates.get(selected_template_type, config_default_template)
for size, templates in grouped_templates.items():
if not templates:
@@ -733,10 +773,12 @@ def main():
display_options.append(display_name)
template_paths_ordered.append(t.template_path) # Add to ordered list
# Set default based on config (priority: config > first default.html in portrait)
# Set default: priority is config > type-specific default > first in portrait
if t.template_path == config_default_template:
default_index = current_index
elif default_index == 0 and "default.html" in t.display_info.name and t.display_info.orientation == 'portrait':
elif default_index == 0 and t.template_path == type_specific_default:
default_index = current_index
elif default_index == 0 and t.display_info.orientation == 'portrait':
default_index = current_index
current_index += 1
@@ -782,10 +824,25 @@ def main():
generator_for_params = HTMLFrameGenerator(template_path_for_params)
custom_params_for_video = generator_for_params.parse_template_parameters()
# Detect if template requires image generation
template_requires_image = generator_for_params.requires_image()
# Store in session state for Image Section to read
st.session_state['template_requires_image'] = template_requires_image
# Get media size from template (for image/video generation)
media_width, media_height = generator_for_params.get_media_size()
st.session_state['template_media_width'] = media_width
st.session_state['template_media_height'] = media_height
# Detect template media type
from pathlib import Path
from pixelle_video.utils.template_util import get_template_type
template_name = Path(frame_template).name
template_media_type = get_template_type(template_name)
template_requires_media = (template_media_type in ["image", "video"])
# Store in session state for workflow filtering
st.session_state['template_media_type'] = template_media_type
st.session_state['template_requires_media'] = template_requires_media
# Backward compatibility
st.session_state['template_requires_image'] = (template_media_type == "image")
custom_values_for_video = {}
if custom_params_for_video:
@@ -928,25 +985,51 @@ def main():
logger.exception(e)
# ====================================================================
# Image Generation Section (conditional based on template)
# Media Generation Section (conditional based on template)
# ====================================================================
# Check if current template requires image generation
if st.session_state.get('template_requires_image', True):
# Template requires images - show full Image Section
# Check if current template requires media generation
template_media_type = st.session_state.get('template_media_type', 'image')
template_requires_media = st.session_state.get('template_requires_media', True)
if template_requires_media:
# Template requires media - show Media Generation Section
with st.container(border=True):
st.markdown(f"**{tr('section.image')}**")
# Dynamic section title based on template type
if template_media_type == "video":
section_title = tr('section.video')
else:
section_title = tr('section.image')
st.markdown(f"**{section_title}**")
# 1. ComfyUI Workflow selection
with st.expander(tr("help.feature_description"), expanded=False):
st.markdown(f"**{tr('help.what')}**")
st.markdown(tr("style.workflow_what"))
if template_media_type == "video":
st.markdown(tr('style.video_workflow_what'))
else:
st.markdown(tr("style.workflow_what"))
st.markdown(f"**{tr('help.how')}**")
st.markdown(tr("style.workflow_how"))
if template_media_type == "video":
st.markdown(tr('style.video_workflow_how'))
else:
st.markdown(tr("style.workflow_how"))
st.markdown(f"**{tr('help.note')}**")
st.markdown(tr("style.image_size_note"))
if template_media_type == "video":
st.markdown(tr('style.video_size_note'))
else:
st.markdown(tr("style.image_size_note"))
# Get available workflows from pixelle_video (with source info)
workflows = pixelle_video.image.list_workflows()
# Get available workflows and filter by template type
all_workflows = pixelle_video.media.list_workflows()
# Filter workflows based on template media type
if template_media_type == "video":
# Only show video_ workflows
workflows = [wf for wf in all_workflows if "video_" in wf["key"].lower()]
else:
# Only show image_ workflows (exclude video_)
workflows = [wf for wf in all_workflows if "video_" not in wf["key"].lower()]
# Build options for selectbox
# Display: "image_flux.json - Runninghub"
@@ -959,7 +1042,9 @@ def main():
# If user has a saved preference in config, try to match it
comfyui_config = config_manager.get_comfyui_config()
saved_workflow = comfyui_config["image"]["default_workflow"]
# Select config based on template type (image or video)
media_config_key = "video" if template_media_type == "video" else "image"
saved_workflow = comfyui_config.get(media_config_key, {}).get("default_workflow", "")
if saved_workflow and saved_workflow in workflow_keys:
default_workflow_index = workflow_keys.index(saved_workflow)
@@ -978,31 +1063,20 @@ def main():
else:
workflow_key = "runninghub/image_flux.json" # fallback
# Get media size from template
image_width = st.session_state.get('template_media_width', 1024)
image_height = st.session_state.get('template_media_height', 1024)
# Display media size info (read-only)
if template_media_type == "video":
size_info_text = tr('style.video_size_info', width=image_width, height=image_height)
else:
size_info_text = tr('style.image_size_info', width=image_width, height=image_height)
st.info(f"📐 {size_info_text}")
# 2. Image size input
col1, col2 = st.columns(2)
with col1:
image_width = st.number_input(
tr('style.image_width'),
min_value=128,
value=1024,
step=1,
label_visibility="visible",
help=tr('style.image_width_help')
)
with col2:
image_height = st.number_input(
tr('style.image_height'),
min_value=128,
value=1024,
step=1,
label_visibility="visible",
help=tr('style.image_height_help')
)
# 3. Prompt prefix input
# Get current prompt_prefix from config
current_prefix = comfyui_config["image"]["prompt_prefix"]
# Prompt prefix input
# Get current prompt_prefix from config (based on media type)
current_prefix = comfyui_config.get(media_config_key, {}).get("prompt_prefix", "")
# Prompt prefix input (temporary, not saved to config)
prompt_prefix = st.text_area(
@@ -1014,54 +1088,71 @@ def main():
help=tr("style.prompt_prefix_help")
)
# Style preview expander (similar to template preview)
with st.expander(tr("style.preview_title"), expanded=False):
# Media preview expander
preview_title = tr("style.video_preview_title") if template_media_type == "video" else tr("style.preview_title")
with st.expander(preview_title, expanded=False):
# Test prompt input
if template_media_type == "video":
test_prompt_label = tr("style.test_video_prompt")
test_prompt_value = "a dog running in the park"
else:
test_prompt_label = tr("style.test_prompt")
test_prompt_value = "a dog"
test_prompt = st.text_input(
tr("style.test_prompt"),
value="a dog",
test_prompt_label,
value=test_prompt_value,
help=tr("style.test_prompt_help"),
key="style_test_prompt"
)
# Preview button
if st.button(tr("style.preview"), key="preview_style", use_container_width=True):
with st.spinner(tr("style.previewing")):
preview_button_label = tr("style.video_preview") if template_media_type == "video" else tr("style.preview")
if st.button(preview_button_label, key="preview_style", use_container_width=True):
previewing_text = tr("style.video_previewing") if template_media_type == "video" else tr("style.previewing")
with st.spinner(previewing_text):
try:
from pixelle_video.utils.prompt_helper import build_image_prompt
# Build final prompt with prefix
final_prompt = build_image_prompt(test_prompt, prompt_prefix)
# Generate preview image (use user-specified size)
preview_image_path = run_async(pixelle_video.image(
# Generate preview media (use user-specified size and media type)
media_result = run_async(pixelle_video.media(
prompt=final_prompt,
workflow=workflow_key,
media_type=template_media_type,
width=int(image_width),
height=int(image_height)
))
preview_media_path = media_result.url
# Display preview (support both URL and local path)
if preview_image_path:
st.success(tr("style.preview_success"))
if preview_media_path:
success_text = tr("style.video_preview_success") if template_media_type == "video" else tr("style.preview_success")
st.success(success_text)
# Read and encode image
if preview_image_path.startswith('http'):
# URL - use directly
img_html = f'<div class="preview-image"><img src="{preview_image_path}" alt="Style Preview"/></div>'
if template_media_type == "video":
# Display video
st.video(preview_media_path)
else:
# Local file - encode as base64
with open(preview_image_path, 'rb') as f:
img_data = base64.b64encode(f.read()).decode()
img_html = f'<div class="preview-image"><img src="data:image/png;base64,{img_data}" alt="Style Preview"/></div>'
st.markdown(img_html, unsafe_allow_html=True)
# Display image
if preview_media_path.startswith('http'):
# URL - use directly
img_html = f'<div class="preview-image"><img src="{preview_media_path}" alt="Style Preview"/></div>'
else:
# Local file - encode as base64
with open(preview_media_path, 'rb') as f:
img_data = base64.b64encode(f.read()).decode()
img_html = f'<div class="preview-image"><img src="data:image/png;base64,{img_data}" alt="Style Preview"/></div>'
st.markdown(img_html, unsafe_allow_html=True)
# Show the final prompt used
st.info(f"**{tr('style.final_prompt_label')}**\n{final_prompt}")
# Show file path
st.caption(f"📁 {preview_image_path}")
st.caption(f"📁 {preview_media_path}")
else:
st.error(tr("style.preview_failed_general"))
except Exception as e:
@@ -1076,10 +1167,12 @@ def main():
st.info(" " + tr("image.not_required"))
st.caption(tr("image.not_required_hint"))
# Get media size from template (even though not used, for consistency)
image_width = st.session_state.get('template_media_width', 1024)
image_height = st.session_state.get('template_media_height', 1024)
# Set default values for later use
workflow_key = None
image_width = 1024
image_height = 1024
prompt_prefix = ""
@@ -1149,14 +1242,13 @@ def main():
progress_bar.progress(min(int(event.progress * 100), 99)) # Cap at 99% until complete
# Generate video (directly pass parameters)
# Note: image_width and image_height are now auto-determined from template
video_params = {
"text": text,
"mode": mode,
"title": title if title else None,
"n_scenes": n_scenes,
"image_workflow": workflow_key,
"image_width": int(image_width),
"image_height": int(image_height),
"frame_template": frame_template,
"prompt_prefix": prompt_prefix,
"bgm_path": bgm_path,
@@ -1211,6 +1303,18 @@ def main():
# Video preview
if os.path.exists(result.video_path):
st.video(result.video_path)
# Download button
with open(result.video_path, "rb") as video_file:
video_bytes = video_file.read()
video_filename = os.path.basename(result.video_path)
st.download_button(
label="⬇️ 下载视频" if get_language() == "zh_CN" else "⬇️ Download Video",
data=video_bytes,
file_name=video_filename,
mime="video/mp4",
use_container_width=True
)
else:
st.error(tr("status.video_not_found", path=result.video_path))

View File

@@ -8,6 +8,8 @@
"section.bgm": "🎵 Background Music",
"section.tts": "🎤 Voiceover",
"section.image": "🎨 Image Generation",
"section.video": "🎬 Video Generation",
"section.media": "🎨 Media Generation",
"section.template": "📐 Storyboard Template",
"section.video_generation": "🎬 Generate Video",
@@ -45,12 +47,10 @@
"style.workflow": "Workflow Selection",
"style.workflow_what": "Determines how each frame's illustration is generated and its effect (e.g., using FLUX, SD models)",
"style.workflow_how": "Place the exported image_xxx.json workflow file(API format) into the workflows/selfhost/ folder (for local ComfyUI) or the workflows/runninghub/ folder (for cloud)",
"style.image_size": "Image Size",
"style.image_width": "Width",
"style.image_height": "Height",
"style.image_width_help": "Width of AI-generated images (Note: This is the image size, not the final video size. Video size is determined by the template)",
"style.image_height_help": "Height of AI-generated images (Note: This is the image size, not the final video size. Video size is determined by the template)",
"style.image_size_note": "Image size controls the dimensions of AI-generated illustrations, and does not affect the final video size. Video size is determined by the Storyboard Template below.",
"style.video_workflow_what": "Determines how each frame's video clip is generated and its effect (e.g., using different video generation models)",
"style.video_workflow_how": "Place the exported video_xxx.json workflow file(API format) into the workflows/selfhost/ folder (for local ComfyUI) or the workflows/runninghub/ folder (for cloud)",
"style.image_size_info": "Image Size: {width}x{height} (auto-determined by template)",
"style.video_size_info": "Video Size: {width}x{height} (auto-determined by template)",
"style.prompt_prefix": "Prompt Prefix",
"style.prompt_prefix_what": "Automatically added before all image prompts to control the illustration style uniformly (e.g., cartoon, realistic)",
"style.prompt_prefix_how": "Enter style description in the input box below. To save permanently, edit the config.yaml file",
@@ -60,11 +60,16 @@
"style.description": "Style Description",
"style.description_placeholder": "Describe the illustration style you want (any language)...",
"style.preview_title": "Preview Style",
"style.video_preview_title": "Preview Video",
"style.test_prompt": "Test Prompt",
"style.test_video_prompt": "Test Video Prompt",
"style.test_prompt_help": "Enter test prompt to preview style effect",
"style.preview": "🖼️ Generate Preview",
"style.video_preview": "🎬 Generate Video Preview",
"style.previewing": "Generating style preview...",
"style.video_previewing": "Generating video preview...",
"style.preview_success": "✅ Preview generated successfully!",
"style.video_preview_success": "✅ Video preview generated successfully!",
"style.preview_caption": "Style Preview",
"style.preview_failed": "Preview failed: {error}",
"style.preview_failed_general": "Failed to generate preview image",
@@ -81,8 +86,15 @@
"template.modern": "Modern",
"template.neon": "Neon",
"template.what": "Controls the visual layout and design style of each frame (title, text, image arrangement)",
"template.how": "Place .html template files in templates/SIZE/ directories (e.g., templates/1080x1920/). Templates are automatically grouped by size. Custom CSS styles are supported.\n\n**Note**\n\nAt least one of the following browsers must be installed on your computer for proper operation:\n1. Google Chrome (Windows, macOS)\n2. Chromium Browser (Linux)\n3. Microsoft Edge",
"template.how": "Place .html template files in templates/SIZE/ directories (e.g., templates/1080x1920/). Templates are automatically grouped by size. Custom CSS styles are supported.\n\n**Template Naming Convention**\n\n- `static_*.html` → Static style templates (no AI-generated media)\n- `image_*.html` → Image generation templates (AI-generated images)\n- `video_*.html` → Video generation templates (AI-generated videos)\n\n**Note**\n\nAt least one of the following browsers must be installed on your computer for proper operation:\n1. Google Chrome (Windows, macOS)\n2. Chromium Browser (Linux)\n3. Microsoft Edge",
"template.size_info": "Template Size",
"template.type_selector": "Template Type",
"template.type.static": "📄 Static Style",
"template.type.image": "🖼️ Generate Images",
"template.type.video": "🎬 Generate Videos",
"template.type.static_hint": "Uses template's built-in styles, no AI-generated media required. You can customize background images and other parameters in the template.",
"template.type.image_hint": "AI automatically generates illustrations matching the narration content. Image size is determined by the template.",
"template.type.video_hint": "AI automatically generates video clips matching the narration content. Video size is determined by the template.",
"orientation.portrait": "Portrait",
"orientation.landscape": "Landscape",
@@ -140,12 +152,16 @@
"progress.generating_narrations": "Generating narrations...",
"progress.splitting_script": "Splitting script...",
"progress.generating_image_prompts": "Generating image prompts...",
"progress.generating_video_prompts": "Generating video prompts...",
"progress.preparing_frames": "Preparing frames...",
"progress.frame": "Frame {current}/{total}",
"progress.frame_step": "Frame {current}/{total} - Step {step}/4: {action}",
"progress.step_audio": "Generating audio...",
"progress.step_image": "Generating image...",
"progress.step_compose": "Composing frame...",
"progress.step_video": "Creating video segment...",
"progress.processing_frame": "Processing frame {current}/{total}...",
"progress.step_audio": "Generating audio",
"progress.step_image": "Generating image",
"progress.step_media": "Generating media",
"progress.step_compose": "Composing frame",
"progress.step_video": "Creating video segment",
"progress.concatenating": "Concatenating video...",
"progress.finalizing": "Finalizing...",
"progress.completed": "✅ Completed",

View File

@@ -8,6 +8,8 @@
"section.bgm": "🎵 背景音乐",
"section.tts": "🎤 配音合成",
"section.image": "🎨 插图生成",
"section.video": "🎬 视频生成",
"section.media": "🎨 媒体生成",
"section.template": "📐 分镜模板",
"section.video_generation": "🎬 生成视频",
@@ -45,12 +47,10 @@
"style.workflow": "工作流选择",
"style.workflow_what": "决定视频中每帧插图的生成方式和效果(如使用 FLUX、SD 等模型)",
"style.workflow_how": "将导出的 image_xxx.json 工作流文件API格式放入 workflows/selfhost/(本地 ComfyUI或 workflows/runninghub/(云端)文件夹",
"style.image_size": "图片尺寸",
"style.image_width": "宽度",
"style.image_height": "高度",
"style.image_width_help": "AI 生成插图的宽度(注意:这是插图尺寸,不是最终视频尺寸。视频尺寸由模板决定)",
"style.image_height_help": "AI 生成插图的高度(注意:这是插图尺寸,不是最终视频尺寸。视频尺寸由模板决定)",
"style.image_size_note": "图片尺寸控制 AI 生成的插图大小,不影响最终视频尺寸。视频尺寸由下方的「📐 分镜模板」决定。",
"style.video_workflow_what": "决定视频中每帧视频片段的生成方式和效果(如使用不同的视频生成模型)",
"style.video_workflow_how": "将导出的 video_xxx.json 工作流文件API格式放入 workflows/selfhost/(本地 ComfyUI或 workflows/runninghub/(云端)文件夹",
"style.image_size_info": "插图尺寸:{width}x{height}(由模板自动决定)",
"style.video_size_info": "视频尺寸:{width}x{height}由模板自动决定)",
"style.prompt_prefix": "提示词前缀",
"style.prompt_prefix_what": "自动添加到所有图片提示词前面,统一控制插图风格(如:卡通风格、写实风格等)",
"style.prompt_prefix_how": "直接在下方输入框填写风格描述。若要永久保存,需编辑 config.yaml 文件",
@@ -60,11 +60,16 @@
"style.description": "风格描述",
"style.description_placeholder": "描述您想要的插图风格(任何语言)...",
"style.preview_title": "预览风格",
"style.video_preview_title": "预览视频",
"style.test_prompt": "测试提示词",
"style.test_video_prompt": "测试视频提示词",
"style.test_prompt_help": "输入测试提示词来预览风格效果",
"style.preview": "🖼️ 生成预览",
"style.video_preview": "🎬 生成视频预览",
"style.previewing": "正在生成风格预览...",
"style.video_previewing": "正在生成视频预览...",
"style.preview_success": "✅ 预览生成成功!",
"style.video_preview_success": "✅ 视频预览生成成功!",
"style.preview_caption": "风格预览",
"style.preview_failed": "预览失败:{error}",
"style.preview_failed_general": "预览图片生成失败",
@@ -81,8 +86,15 @@
"template.modern": "现代",
"template.neon": "霓虹",
"template.what": "控制视频每一帧的视觉布局和设计风格(标题、文本、图片的排版样式)",
"template.how": "将 .html 模板文件放入 templates/尺寸/ 目录(如 templates/1080x1920/),系统会自动按尺寸分组。支持自定义 CSS 样式。\n\n**注意**\n\n您的计算机上必须安装以下至少一种浏览器才能正常运行\n1. Google ChromeWindows、MacOS\n2. Chromium 浏览器Linux\n3. Microsoft Edge",
"template.how": "将 .html 模板文件放入 templates/尺寸/ 目录(如 templates/1080x1920/),系统会自动按尺寸分组。支持自定义 CSS 样式。\n\n**模板命名规范**\n\n- `static_*.html` → 静态样式模板无需AI生成媒体\n- `image_*.html` → 生成插图模板AI生成图片\n- `video_*.html` → 生成视频模板AI生成视频\n\n**注意**\n\n您的计算机上必须安装以下至少一种浏览器才能正常运行\n1. Google ChromeWindows、MacOS\n2. Chromium 浏览器Linux\n3. Microsoft Edge",
"template.size_info": "模板尺寸",
"template.type_selector": "分镜类型",
"template.type.static": "📄 静态样式",
"template.type.image": "🖼️ 生成插图",
"template.type.video": "🎬 生成视频",
"template.type.static_hint": "使用模板自带样式无需AI生成媒体。可在模板中自定义背景图片等参数。",
"template.type.image_hint": "AI自动根据文案内容生成与之匹配的插图插图尺寸由模板决定。",
"template.type.video_hint": "AI自动根据文案内容生成与之匹配的视频片段视频尺寸由模板决定。",
"orientation.portrait": "竖屏",
"orientation.landscape": "横屏",
@@ -140,12 +152,16 @@
"progress.generating_narrations": "生成旁白...",
"progress.splitting_script": "切分脚本...",
"progress.generating_image_prompts": "生成图片提示词...",
"progress.generating_video_prompts": "生成视频提示词...",
"progress.preparing_frames": "准备分镜...",
"progress.frame": "分镜 {current}/{total}",
"progress.frame_step": "分镜 {current}/{total} - 步骤 {step}/4: {action}",
"progress.step_audio": "生成语音...",
"progress.step_image": "生成插图...",
"progress.step_compose": "合成画面...",
"progress.step_video": "创建视频片段...",
"progress.processing_frame": "处理分镜 {current}/{total}...",
"progress.step_audio": "生成语音",
"progress.step_image": "生成插图",
"progress.step_media": "生成媒体",
"progress.step_compose": "合成画面",
"progress.step_video": "创建视频片段",
"progress.concatenating": "正在拼接视频...",
"progress.finalizing": "完成中...",
"progress.completed": "✅ 生成完成",

View File

@@ -0,0 +1,5 @@
{
"source": "runninghub",
"workflow_id": "1985909483975188481"
}

View File

@@ -0,0 +1,187 @@
{
"3": {
"inputs": {
"seed": 576600626757621,
"steps": 10,
"cfg": 1,
"sampler_name": "uni_pc",
"scheduler": "normal",
"denoise": 1,
"model": [
"48",
0
],
"positive": [
"6",
0
],
"negative": [
"7",
0
],
"latent_image": [
"40",
0
]
},
"class_type": "KSampler",
"_meta": {
"title": "KSampler"
}
},
"6": {
"inputs": {
"text": [
"49",
0
],
"clip": [
"38",
0
]
},
"class_type": "CLIPTextEncode",
"_meta": {
"title": "CLIP Text Encode (Positive Prompt)"
}
},
"7": {
"inputs": {
"text": "色调艳丽过曝静态细节模糊不清字幕风格作品画作画面静止整体发灰最差质量低质量JPEG压缩残留丑陋的残缺的多余的手指画得不好的手部画得不好的脸部畸形的毁容的形态畸形的肢体手指融合静止不动的画面杂乱的背景三条腿背景人很多倒着走",
"clip": [
"38",
0
]
},
"class_type": "CLIPTextEncode",
"_meta": {
"title": "CLIP Text Encode (Negative Prompt)"
}
},
"8": {
"inputs": {
"samples": [
"3",
0
],
"vae": [
"39",
0
]
},
"class_type": "VAEDecode",
"_meta": {
"title": "VAE Decode"
}
},
"30": {
"inputs": {
"frame_rate": 16,
"loop_count": 0,
"filename_prefix": "Video",
"format": "video/h264-mp4",
"pix_fmt": "yuv420p",
"crf": 19,
"save_metadata": true,
"trim_to_audio": false,
"pingpong": false,
"save_output": true,
"images": [
"8",
0
]
},
"class_type": "VHS_VideoCombine",
"_meta": {
"title": "Video Combine 🎥🅥🅗🅢"
}
},
"37": {
"inputs": {
"unet_name": "wan-fusionx/WanT2V_MasterModel.safetensors",
"weight_dtype": "default"
},
"class_type": "UNETLoader",
"_meta": {
"title": "Load Diffusion Model"
}
},
"38": {
"inputs": {
"clip_name": "umt5_xxl_fp8_e4m3fn_scaled.safetensors",
"type": "wan",
"device": "default"
},
"class_type": "CLIPLoader",
"_meta": {
"title": "Load CLIP"
}
},
"39": {
"inputs": {
"vae_name": "wan_2.1_vae.safetensors"
},
"class_type": "VAELoader",
"_meta": {
"title": "Load VAE"
}
},
"40": {
"inputs": {
"width": [
"50",
0
],
"height": [
"51",
0
],
"length": 81,
"batch_size": 1
},
"class_type": "EmptyHunyuanLatentVideo",
"_meta": {
"title": "EmptyHunyuanLatentVideo"
}
},
"48": {
"inputs": {
"shift": 1,
"model": [
"37",
0
]
},
"class_type": "ModelSamplingSD3",
"_meta": {
"title": "Shift"
}
},
"49": {
"inputs": {
"value": "草地上有个小狗在奔跑"
},
"class_type": "PrimitiveStringMultiline",
"_meta": {
"title": "$prompt.value!"
}
},
"50": {
"inputs": {
"value": 512
},
"class_type": "easy int",
"_meta": {
"title": "$width.value"
}
},
"51": {
"inputs": {
"value": 288
},
"class_type": "easy int",
"_meta": {
"title": "$height.value"
}
}
}