分镜支持视频功能

2025-11-11 20:38:31 +08:00
parent cf9321feac
commit 0e2b6b17d0
17 changed files with 1225 additions and 321 deletions
--- a/api/routers/image.py
+++ b/api/routers/image.py
@@ -43,18 +43,27 @@ async def image_generate(
    try:
        logger.info(f"Image generation request: {request.prompt[:50]}...")
-        # Call image service
+        # Call media service (backward compatible with image API)
-        image_path = await pixelle_video.image(
+        media_result = await pixelle_video.media(
            prompt=request.prompt,
            width=request.width,
            height=request.height,
            workflow=request.workflow
        )
-        return ImageGenerateResponse(
+        # For backward compatibility, only support image results in /image endpoint
-            image_path=image_path
+        if media_result.is_video:
            raise HTTPException(
                status_code=400,
                detail="Video workflow used. Please use /media/generate endpoint for video generation."
            )
        return ImageGenerateResponse(
            image_path=media_result.url
        )
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Image generation error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
--- a/pixelle_video/models/media.py
+++ b/pixelle_video/models/media.py
@@ -0,0 +1,61 @@
 # Copyright (C) 2025 AIDC-AI
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #     http://www.apache.org/licenses/LICENSE-2.0
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 Media generation result models
 """
 from typing import Literal, Optional
 from pydantic import BaseModel, Field
 class MediaResult(BaseModel):
    """
    Outcome of a media-generation workflow run.
    One model covers both still images and video clips produced by ComfyUI
    workflows; inspect ``media_type`` (or the ``is_image`` / ``is_video``
    helpers) to tell which kind of media was produced.
    Attributes:
        media_type: Kind of media generated, either "image" or "video"
        url: URL or path pointing at the generated media
        duration: Length in seconds (set for video only, None for image)
    Examples:
        # Image result
        MediaResult(media_type="image", url="http://example.com/image.png")
        # Video result
        MediaResult(media_type="video", url="http://example.com/video.mp4", duration=5.2)
    """
    # Restricted to the two literal values by the type annotation
    media_type: Literal["image", "video"] = Field(description="Type of generated media")
    # Location of the generated file (remote URL or local path)
    url: str = Field(description="URL or path to the generated media file")
    # Optional; defaults to None when no duration is supplied
    duration: Optional[float] = Field(
        default=None,
        description="Duration in seconds (only applicable for video)",
    )
    @property
    def is_image(self) -> bool:
        """Whether ``media_type`` equals "image"."""
        kind = self.media_type
        return kind == "image"
    @property
    def is_video(self) -> bool:
        """Whether ``media_type`` equals "video"."""
        kind = self.media_type
        return kind == "video"
--- a/pixelle_video/models/storyboard.py
+++ b/pixelle_video/models/storyboard.py
@@ -57,16 +57,18 @@ class StoryboardFrame:
    """Single storyboard frame"""
    index: int                                 # Frame index (0-based)
    narration: str                             # Narration text
-    image_prompt: str                          # Image generation prompt
+    image_prompt: str                          # Image generation prompt (can be None for text-only or video)
    # Generated resource paths
-    audio_path: Optional[str] = None           # Audio file path
+    audio_path: Optional[str] = None           # Audio file path (narration)
-    image_path: Optional[str] = None           # Original image path
+    media_type: Optional[str] = None           # Media type: "image" or "video" (None if no media)
-    composed_image_path: Optional[str] = None  # Composed image path (with subtitles)
+    image_path: Optional[str] = None           # Original image path (for image type)
-    video_segment_path: Optional[str] = None   # Video segment path
+    video_path: Optional[str] = None           # Original video path (for video type, before composition)
    composed_image_path: Optional[str] = None  # Composed image path (with subtitles, for image type)
    video_segment_path: Optional[str] = None   # Final video segment path
    # Metadata
-    duration: float = 0.0                      # Audio duration (seconds)
+    duration: float = 0.0                      # Frame duration (seconds, from audio or video)
    created_at: Optional[datetime] = None
    def __post_init__(self):
--- a/pixelle_video/pipelines/base.py
+++ b/pixelle_video/pipelines/base.py
@@ -63,9 +63,12 @@ class BasePipeline(ABC):
        # Quick access to services (convenience)
        self.llm = pixelle_video_core.llm
        self.tts = pixelle_video_core.tts
-        self.image = pixelle_video_core.image
+        self.media = pixelle_video_core.media
        self.video = pixelle_video_core.video
        # Backward compatibility alias
        self.image = pixelle_video_core.media
    @abstractmethod
    async def __call__(
        self,
--- a/pixelle_video/pipelines/standard.py
+++ b/pixelle_video/pipelines/standard.py
@@ -269,11 +269,13 @@ class StandardPipeline(BasePipeline):
        )
        # ========== Step 0.8: Check template requirements ==========
-        template_requires_image = self._check_template_requires_image(config.frame_template)
+        template_media_type = self._check_template_media_type(config.frame_template)
-        if template_requires_image:
+        if template_media_type == "video":
            logger.info(f"🎬 Template requires video generation")
        elif template_media_type == "image":
            logger.info(f"📸 Template requires image generation")
-        else:
+        else:  # text
-            logger.info(f"⚡ Template does not require images - skipping image generation pipeline")
+            logger.info(f"⚡ Template does not require media - skipping media generation pipeline")
            logger.info(f"   💡 Benefits: Faster generation + Lower cost + No ComfyUI dependency")
        try:
@@ -294,8 +296,61 @@ class StandardPipeline(BasePipeline):
                logger.info(f"✅ Split script into {len(narrations)} segments (by lines)")
                logger.info(f"   Note: n_scenes={n_scenes} is ignored in fixed mode")
-            # ========== Step 2: Generate image prompts (conditional) ==========
+            # ========== Step 2: Generate media prompts (conditional) ==========
-            if template_requires_image:
+            if template_media_type == "video":
                # Video template: generate video prompts
                self._report_progress(progress_callback, "generating_video_prompts", 0.15)
                from pixelle_video.utils.content_generators import generate_video_prompts
                # Override prompt_prefix if provided
                original_prefix = None
                if prompt_prefix is not None:
                    image_config = self.core.config.get("comfyui", {}).get("image", {})
                    original_prefix = image_config.get("prompt_prefix")
                    image_config["prompt_prefix"] = prompt_prefix
                    logger.info(f"Using custom prompt_prefix: '{prompt_prefix}'")
                try:
                    # Create progress callback wrapper for video prompt generation
                    def video_prompt_progress(completed: int, total: int, message: str):
                        batch_progress = completed / total if total > 0 else 0
                        overall_progress = 0.15 + (batch_progress * 0.15)
                        self._report_progress(
                            progress_callback,
                            "generating_video_prompts",
                            overall_progress,
                            extra_info=message
                        )
                    # Generate base video prompts
                    base_image_prompts = await generate_video_prompts(
                        self.llm,
                        narrations=narrations,
                        min_words=min_image_prompt_words,
                        max_words=max_image_prompt_words,
                        progress_callback=video_prompt_progress
                    )
                    # Apply prompt prefix
                    from pixelle_video.utils.prompt_helper import build_image_prompt
                    image_config = self.core.config.get("comfyui", {}).get("image", {})
                    prompt_prefix_to_use = prompt_prefix if prompt_prefix is not None else image_config.get("prompt_prefix", "")
                    image_prompts = []
                    for base_prompt in base_image_prompts:
                        final_prompt = build_image_prompt(base_prompt, prompt_prefix_to_use)
                        image_prompts.append(final_prompt)
                finally:
                    # Restore original prompt_prefix
                    if original_prefix is not None:
                        image_config["prompt_prefix"] = original_prefix
                logger.info(f"✅ Generated {len(image_prompts)} video prompts")
            elif template_media_type == "image":
                # Image template: generate image prompts
                self._report_progress(progress_callback, "generating_image_prompts", 0.15)
                # Override prompt_prefix if provided
@@ -343,12 +398,13 @@ class StandardPipeline(BasePipeline):
                        image_config["prompt_prefix"] = original_prefix
                logger.info(f"✅ Generated {len(image_prompts)} image prompts")
-            else:
+            
-                # Skip image prompt generation
+            else:  # text
                # Text-only template: skip media prompt generation
                image_prompts = [None] * len(narrations)
                self._report_progress(progress_callback, "preparing_frames", 0.15)
-                logger.info(f"⚡ Skipped image prompt generation (template doesn't need images)")
+                logger.info(f"⚡ Skipped media prompt generation (text-only template)")
-                logger.info(f"   💡 Savings: {len(narrations)} LLM calls + {len(narrations)} image generations")
+                logger.info(f"   💡 Savings: {len(narrations)} LLM calls + {len(narrations)} media generations")
            # ========== Step 3: Create frames ==========
            for i, (narration, image_prompt) in enumerate(zip(narrations, image_prompts)):
@@ -452,29 +508,44 @@ class StandardPipeline(BasePipeline):
            logger.error(f"❌ Video generation failed: {e}")
            raise
-    def _check_template_requires_image(self, frame_template: str) -> bool:
+    def _check_template_media_type(self, frame_template: str) -> str:
        """
-        Check if template requires image generation
+        Check template media type requirement
        This is checked at pipeline level to avoid unnecessary:
-        - LLM calls (generating image_prompts)
+        - LLM calls (generating media prompts)
-        - Image generation API calls
+        - Media generation API calls
        - ComfyUI dependency
        Template naming rules:
        - video_*.html: Video template (returns "video")
        - Other templates with {{image}}: Image template (returns "image")
        - Other templates without {{image}}: Text-only template (returns "text")
        Args:
-            frame_template: Template path (e.g., "1080x1920/default.html")
+            frame_template: Template path (e.g., "1080x1920/default.html" or "1080x1920/video_default.html")
        Returns:
-            True if template contains {{image}}, False otherwise
+            "video", "image", or "text"
        """
        from pixelle_video.services.frame_html import HTMLFrameGenerator
        from pixelle_video.utils.template_util import resolve_template_path
        # Check if template name starts with video_
        template_name = Path(frame_template).name
        if template_name.startswith("video_"):
            logger.debug(f"Template '{frame_template}' is video template (video_ prefix)")
            return "video"
        # Check if template contains {{image}}
        template_path = resolve_template_path(frame_template)
        generator = HTMLFrameGenerator(template_path)
-        requires = generator.requires_image()
+        requires_image = generator.requires_image()
-        logger.debug(f"Template '{frame_template}' requires_image={requires}")
+        if requires_image:
-        
+            logger.debug(f"Template '{frame_template}' is image template (has {{image}})")
-        return requires
+            return "image"
        else:
            logger.debug(f"Template '{frame_template}' is text-only template")
            return "text"
--- a/pixelle_video/prompts/video_generation.py
+++ b/pixelle_video/prompts/video_generation.py
@@ -0,0 +1,133 @@
 # Copyright (C) 2025 AIDC-AI
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #     http://www.apache.org/licenses/LICENSE-2.0
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 Video prompt generation template
 For generating video prompts from narrations.
 """
 import json
 from typing import List
 # LLM prompt template for turning narrations into English video prompts.
 # Placeholders filled by build_video_prompt_prompt() via str.format():
 #   {narrations_json}  - JSON document of the form {"narrations": [...]}
 #   {narrations_count} - number of narrations (and of expected prompts)
 #   {min_words} / {max_words} - suggested word-count range for each prompt
 # Literal braces in the JSON examples are escaped as {{ and }}.
 VIDEO_PROMPT_GENERATION_PROMPT = """# 角色定位
 你是一个专业的视频创意设计师，擅长为视频脚本创作富有动感和表现力的视频生成提示词，将叙述内容转化为生动的视频画面。
 # 核心任务
 基于已有的视频脚本，为每个分镜的"旁白内容"创作对应的**英文**视频生成提示词，确保视频画面与叙述内容完美配合，通过动态画面增强观众的理解和记忆。
 **重要：输入包含 {narrations_count} 个旁白，你必须为每个旁白都生成一个对应的视频提示词，总共输出 {narrations_count} 个视频提示词。**
 # 输入内容
 {narrations_json}
 # 输出要求
 ## 视频提示词规范
 - 语言：**必须使用英文**（用于 AI 视频生成模型）
 - 描述结构：scene + character action + camera movement + emotion + atmosphere
 - 描述长度：确保描述清晰完整且富有创意（建议 {min_words}-{max_words} 个英文单词）
 - 动态元素：强调动作、运动、变化等动态效果
 ## 视觉创意要求
 - 每个视频都要准确反映对应旁白的具体内容和情感
 - 突出画面的动态性：角色动作、物体运动、镜头移动、场景转换等
 - 使用象征手法将抽象概念视觉化（如用流动的水代表时间流逝，用上升的阶梯代表进步等）
 - 画面要表现出丰富的情感和动作，增强视觉冲击力
 - 通过镜头语言（推拉摇移）和剪辑节奏增强表现力
 ## 关键英文词汇参考
 - 动作：moving, running, flowing, transforming, growing, falling
 - 镜头：camera pan, zoom in, zoom out, tracking shot, aerial view
 - 转场：transition, fade in, fade out, dissolve
 - 氛围：dynamic, energetic, peaceful, dramatic, mysterious
 - 光影：lighting changes, shadows moving, sunlight streaming
 ## 视频与文案配合原则
 - 视频要服务于文案，成为文案内容的视觉延伸
 - 避免与文案内容无关或矛盾的视觉元素
 - 选择最能增强文案说服力的动态表现方式
 - 确保观众能通过视频动态快速理解文案的核心观点
 ## 创意指导
 1. **现象描述类文案**：用动态场景表现社会现象的发生过程
 2. **原因分析类文案**：用因果关系的动态演变表现内在逻辑
 3. **影响论证类文案**：用后果场景的动态展开或对比表现影响程度
 4. **深入探讨类文案**：用抽象概念的动态具象化表现深刻思考
 5. **结论启发类文案**：用开放式动态场景或指引性运动表现启发性
 ## 视频特有注意事项
 - 强调动态：每个视频都应该包含明显的动作或运动
 - 镜头语言：适当使用推拉摇移等镜头技巧增强表现力
 - 时长考虑：视频应该是连贯的动态过程，不是静态画面
 - 流畅性：注意动作的流畅性和自然性
 # 输出格式
 严格按照以下JSON格式输出，**视频提示词必须是英文**：
 ```json
 {{
  "video_prompts": [
    "[detailed English video prompt with dynamic elements and camera movements]",
    "[detailed English video prompt with dynamic elements and camera movements]"
  ]
 }}
 ```
 # 重要提醒
 1. 只输出JSON格式内容，不要添加任何解释说明
 2. 确保JSON格式严格正确，可以被程序直接解析
 3. 输入是 {{"narrations": [旁白数组]}} 格式，输出是 {{"video_prompts": [视频提示词数组]}} 格式
 4. **输出的video_prompts数组必须恰好包含 {narrations_count} 个元素，与输入的narrations数组一一对应**
 5. **视频提示词必须使用英文**（for AI video generation models）
 6. 视频提示词必须准确反映对应旁白的具体内容和情感
 7. 每个视频都要强调动态性和运动感，避免静态描述
 8. 适当使用镜头语言增强表现力
 9. 确保视频画面能增强文案的说服力和观众的理解度
 现在，请为上述 {narrations_count} 个旁白创作对应的 {narrations_count} 个**英文**视频提示词。只输出JSON，不要其他内容。
 """
 def build_video_prompt_prompt(
    narrations: List[str],
    min_words: int,
    max_words: int
 ) -> str:
    """
    Build the LLM prompt that asks for one video prompt per narration
    The narrations are serialized as {"narrations": [...]} (non-ASCII text is
    kept as-is) and substituted into VIDEO_PROMPT_GENERATION_PROMPT together
    with the narration count and the suggested word-count range.
    Args:
        narrations: List of narrations
        min_words: Lower bound of the suggested prompt length (in English words)
        max_words: Upper bound of the suggested prompt length (in English words)
    Returns:
        Formatted prompt for LLM
    Example:
        >>> build_video_prompt_prompt(narrations, 50, 100)
    """
    # ensure_ascii=False keeps Chinese narrations readable for the LLM
    narrations_json = json.dumps(
        {"narrations": narrations},
        ensure_ascii=False,
        indent=2
    )
    # min_words/max_words feed the "描述长度" guideline; previously the template
    # hard-coded "50-100" and str.format() silently dropped these two arguments
    return VIDEO_PROMPT_GENERATION_PROMPT.format(
        narrations_json=narrations_json,
        narrations_count=len(narrations),
        min_words=min_words,
        max_words=max_words
    )
--- a/pixelle_video/service.py
+++ b/pixelle_video/service.py
@@ -23,7 +23,7 @@ from loguru import logger
 from pixelle_video.config import config_manager
 from pixelle_video.services.llm_service import LLMService
 from pixelle_video.services.tts_service import TTSService
-from pixelle_video.services.image import ImageService
+from pixelle_video.services.media import MediaService
 from pixelle_video.services.video import VideoService
 from pixelle_video.services.frame_processor import FrameProcessor
 from pixelle_video.pipelines.standard import StandardPipeline
@@ -45,7 +45,7 @@ class PixelleVideoCore:
        # Use capabilities directly
        answer = await pixelle_video.llm("Explain atomic habits")
        audio = await pixelle_video.tts("Hello world")
-        image = await pixelle_video.image(prompt="a cat")
+        media = await pixelle_video.media(prompt="a cat")
        # Check active capabilities
        print(f"Using LLM: {pixelle_video.llm.active}")
@@ -56,7 +56,7 @@ class PixelleVideoCore:
          ├── config (configuration)
          ├── llm (LLM service - direct OpenAI SDK)
          ├── tts (TTS service - ComfyKit workflows)
-          ├── image (Image service - ComfyKit workflows)
+          ├── media (Media service - ComfyKit workflows, supports image & video)
          └── pipelines (video generation pipelines)
              ├── standard (standard workflow)
              ├── custom (custom workflow template)
@@ -77,7 +77,7 @@ class PixelleVideoCore:
        # Core services (initialized in initialize())
        self.llm: Optional[LLMService] = None
        self.tts: Optional[TTSService] = None
-        self.image: Optional[ImageService] = None
+        self.media: Optional[MediaService] = None
        self.video: Optional[VideoService] = None
        self.frame_processor: Optional[FrameProcessor] = None
@@ -105,7 +105,7 @@ class PixelleVideoCore:
        # 1. Initialize core services
        self.llm = LLMService(self.config)
        self.tts = TTSService(self.config)
-        self.image = ImageService(self.config)
+        self.media = MediaService(self.config)
        self.video = VideoService()
        self.frame_processor = FrameProcessor(self)
--- a/pixelle_video/services/init.py
+++ b/pixelle_video/services/init.py
@@ -18,7 +18,7 @@ Core services providing atomic capabilities.
 Services:
 - LLMService: LLM text generation
 - TTSService: Text-to-speech
- ImageService: Image generation
+- MediaService: Media generation (image & video)
 - VideoService: Video processing
 - FrameProcessor: Frame processing orchestrator
 - ComfyBaseService: Base class for ComfyUI-based services
@@ -27,15 +27,19 @@ Services:
 from pixelle_video.services.comfy_base_service import ComfyBaseService
 from pixelle_video.services.llm_service import LLMService
 from pixelle_video.services.tts_service import TTSService
-from pixelle_video.services.image import ImageService
+from pixelle_video.services.media import MediaService
 from pixelle_video.services.video import VideoService
 from pixelle_video.services.frame_processor import FrameProcessor
 # Backward compatibility alias
 ImageService = MediaService
 __all__ = [
    "ComfyBaseService",
    "LLMService",
    "TTSService",
-    "ImageService",
+    "MediaService",
    "ImageService",  # Backward compatibility
    "VideoService",
    "FrameProcessor",
 ]
--- a/pixelle_video/services/frame_processor.py
+++ b/pixelle_video/services/frame_processor.py
@@ -84,7 +84,7 @@ class FrameProcessor:
                ))
            await self._step_generate_audio(frame, config)
-            # Step 2: Generate image (conditional)
+            # Step 2: Generate media (image or video, conditional)
            if needs_image:
                if progress_callback:
                    progress_callback(ProgressEvent(
@@ -93,12 +93,13 @@ class FrameProcessor:
                        frame_current=frame_num,
                        frame_total=total_frames,
                        step=2,
-                        action="image"
+                        action="media"
                    ))
-                await self._step_generate_image(frame, config)
+                await self._step_generate_media(frame, config)
            else:
                frame.image_path = None
-                logger.debug(f"  2/4: Skipped image generation (not required by template)")
+                frame.media_type = None
                logger.debug(f"  2/4: Skipped media generation (not required by template)")
            # Step 3: Compose frame (add subtitle)
            if progress_callback:
@@ -176,28 +177,67 @@ class FrameProcessor:
        logger.debug(f"  ✓ Audio generated: {audio_path} ({frame.duration:.2f}s)")
-    async def _step_generate_image(
+    async def _step_generate_media(
        self,
        frame: StoryboardFrame,
        config: StoryboardConfig
    ):
-        """Step 2: Generate image using ComfyKit"""
+        """Step 2: Generate media (image or video) using ComfyKit"""
-        logger.debug(f"  2/4: Generating image for frame {frame.index}...")
+        logger.debug(f"  2/4: Generating media for frame {frame.index}...")
-        # Call Image generation (with optional preset)
+        # Determine media type based on workflow
-        image_url = await self.core.image(
+        # video_ prefix in workflow name indicates video generation
        workflow_name = config.image_workflow or ""
        is_video_workflow = "video_" in workflow_name.lower()
        media_type = "video" if is_video_workflow else "image"
        logger.debug(f"  → Media type: {media_type} (workflow: {workflow_name})")
        # Call Media generation (with optional preset)
        media_result = await self.core.media(
            prompt=frame.image_prompt,
            workflow=config.image_workflow,  # Pass workflow from config (None = use default)
            media_type=media_type,
            width=config.image_width,
            height=config.image_height
        )
-        # Download image to local (pass task_id)
+        # Store media type
-        local_path = await self._download_image(image_url, frame.index, config.task_id)
+        frame.media_type = media_result.media_type
        frame.image_path = local_path
        if media_result.is_image:
            # Download image to local (pass task_id)
            local_path = await self._download_media(
                media_result.url,
                frame.index,
                config.task_id,
                media_type="image"
            )
            frame.image_path = local_path
            logger.debug(f"  ✓ Image generated: {local_path}")
        elif media_result.is_video:
            # Download video to local (pass task_id)
            local_path = await self._download_media(
                media_result.url,
                frame.index,
                config.task_id,
                media_type="video"
            )
            frame.video_path = local_path
            # Update duration from video if available
            if media_result.duration:
                frame.duration = media_result.duration
                logger.debug(f"  ✓ Video generated: {local_path} (duration: {frame.duration:.2f}s)")
            else:
                # Get video duration from file
                frame.duration = await self._get_video_duration(local_path)
                logger.debug(f"  ✓ Video generated: {local_path} (duration: {frame.duration:.2f}s)")
        else:
            raise ValueError(f"Unknown media type: {media_result.media_type}")
    async def _step_compose_frame(
        self,
        frame: StoryboardFrame,
@@ -211,7 +251,9 @@ class FrameProcessor:
        from pixelle_video.utils.os_util import get_task_frame_path
        output_path = get_task_frame_path(config.task_id, frame.index, "composed")
-        # Use HTML template to compose frame
+        # For video type: render HTML as transparent overlay image
        # For image type: render HTML with image background
        # In both cases, we need the composed image
        composed_path = await self._compose_frame_html(frame, storyboard, config, output_path)
        frame.composed_image_path = composed_path
@@ -264,17 +306,51 @@ class FrameProcessor:
        frame: StoryboardFrame,
        config: StoryboardConfig
    ):
-        """Step 4: Create video segment from image + audio"""
+        """Step 4: Create video segment from media + audio"""
        logger.debug(f"  4/4: Creating video segment for frame {frame.index}...")
        # Generate output path using task_id
        from pixelle_video.utils.os_util import get_task_frame_path
        output_path = get_task_frame_path(config.task_id, frame.index, "segment")
        # Call video compositor to create video from image + audio
        from pixelle_video.services.video import VideoService
        video_service = VideoService()
        # Branch based on media type
        if frame.media_type == "video":
            # Video workflow: overlay HTML template on video, then add audio
            logger.debug(f"  → Using video-based composition with HTML overlay")
            # Step 1: Overlay transparent HTML image on video
            # The composed_image_path contains the rendered HTML with transparent background
            temp_video_with_overlay = get_task_frame_path(config.task_id, frame.index, "video") + "_overlay.mp4"
            video_service.overlay_image_on_video(
                video=frame.video_path,
                overlay_image=frame.composed_image_path,
                output=temp_video_with_overlay,
                scale_mode="contain"  # Scale video to fit template size (contain mode)
            )
            # Step 2: Add narration audio to the overlaid video
            # Note: The video might have audio (replaced) or be silent (audio added)
            segment_path = video_service.merge_audio_video(
                video=temp_video_with_overlay,
                audio=frame.audio_path,
                output=output_path,
                replace_audio=True,  # Replace video audio with narration
                audio_volume=1.0
            )
            # Clean up temp file
            import os
            if os.path.exists(temp_video_with_overlay):
                os.unlink(temp_video_with_overlay)
        elif frame.media_type == "image" or frame.media_type is None:
            # Image workflow: create video from image + audio
            logger.debug(f"  → Using image-based composition")
            segment_path = video_service.create_video_from_image(
                image=frame.composed_image_path,
                audio=frame.audio_path,
@@ -282,6 +358,9 @@ class FrameProcessor:
                fps=config.video_fps
            )
        else:
            raise ValueError(f"Unknown media type: {frame.media_type}")
        frame.video_segment_path = segment_path
        logger.debug(f"  ✓ Video segment created: {segment_path}")
@@ -303,10 +382,16 @@ class FrameProcessor:
            estimated_duration = file_size / 2000
            return max(1.0, estimated_duration)  # At least 1 second
-    async def _download_image(self, url: str, frame_index: int, task_id: str) -> str:
+    async def _download_media(
-        """Download image from URL to local file"""
+        self,
        url: str,
        frame_index: int,
        task_id: str,
        media_type: str
    ) -> str:
        """Download media (image or video) from URL to local file"""
        from pixelle_video.utils.os_util import get_task_frame_path
-        output_path = get_task_frame_path(task_id, frame_index, "image")
+        output_path = get_task_frame_path(task_id, frame_index, media_type)
        async with httpx.AsyncClient() as client:
            response = await client.get(url)
@@ -317,3 +402,15 @@ class FrameProcessor:
        return output_path
    async def _get_video_duration(self, video_path: str) -> float:
        """
        Get video duration in seconds
        Probes the file with ffmpeg.probe and reads the container-level
        duration. This is best-effort: any failure (ffmpeg module missing,
        probe error, missing/invalid duration field) is logged and a fixed
        default is returned instead of raising.
        Args:
            video_path: Path of the video file to probe
        Returns:
            Duration reported by the probe, or 1.0 if it cannot be determined
        """
        try:
            # Imported lazily so the module is only needed when a video is probed
            import ffmpeg
            probe = ffmpeg.probe(video_path)
            return float(probe['format']['duration'])
        except Exception as e:
            # No audio information is available in this method, so the only
            # possible fallback is a fixed default; say so in the log instead
            # of claiming the audio duration is used
            logger.warning(f"Failed to get video duration: {e}, using default duration of 1.0s")
            return 1.0  # Default to 1 second if unable to determine
--- a/pixelle_video/services/image.py
+++ b/pixelle_video/services/image.py
@@ -1,192 +0,0 @@
 # Copyright (C) 2025 AIDC-AI
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #     http://www.apache.org/licenses/LICENSE-2.0
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 Image Generation Service - ComfyUI Workflow-based implementation
 """
 from typing import Optional
 from comfykit import ComfyKit
 from loguru import logger
 from pixelle_video.services.comfy_base_service import ComfyBaseService
 class ImageService(ComfyBaseService):
    """
    Workflow-driven image generation service
    Runs an image workflow through ComfyKit and hands back the first image
    reported by the execution result.
    Usage:
        # Let the service resolve which workflow to run
        image_url = await pixelle_video.image(prompt="a cat")
        # Name the workflow explicitly
        image_url = await pixelle_video.image(
            prompt="a cat",
            workflow="image_flux.json"
        )
        # Enumerate the workflows that can be used
        workflows = pixelle_video.image.list_workflows()
    """
    WORKFLOW_PREFIX = "image_"
    DEFAULT_WORKFLOW = None  # Nothing hardcoded here; a default has to be configured
    WORKFLOWS_DIR = "workflows"
    def __init__(self, config: dict):
        """
        Set up the service under the "image" service name
        Args:
            config: Full application config dict
        """
        super().__init__(config, service_name="image")
    async def __call__(
        self,
        prompt: str,
        workflow: Optional[str] = None,
        # ComfyUI connection (optional overrides)
        comfyui_url: Optional[str] = None,
        runninghub_api_key: Optional[str] = None,
        # Common workflow parameters
        width: Optional[int] = None,
        height: Optional[int] = None,
        negative_prompt: Optional[str] = None,
        steps: Optional[int] = None,
        seed: Optional[int] = None,
        cfg: Optional[float] = None,
        sampler: Optional[str] = None,
        **params
    ) -> str:
        """
        Run an image workflow and return the first generated image
        Args:
            prompt: Image generation prompt
            workflow: Workflow to run; when omitted the choice is left to
                _resolve_workflow
            comfyui_url: ComfyUI URL (optional, overrides config)
            runninghub_api_key: RunningHub API key (optional, overrides config)
            width: Image width
            height: Image height
            negative_prompt: Negative prompt
            steps: Sampling steps
            seed: Random seed
            cfg: CFG scale
            sampler: Sampler name
            **params: Extra workflow parameters, forwarded as-is
        Returns:
            First entry of the execution result's image list (URL/path)
        Raises:
            Exception: If the workflow does not complete or yields no images;
                any error raised during execution is logged and re-raised
        Examples:
            # Minimal call
            image_url = await pixelle_video.image(prompt="a beautiful cat")
            # Explicit workflow with sampling options
            image_url = await pixelle_video.image(
                prompt="a cat",
                workflow="image_flux.json",
                width=1024,
                height=1024,
                steps=20,
                seed=42
            )
            # Workflow given as an absolute path
            image_url = await pixelle_video.image(
                prompt="a cat",
                workflow="/path/to/custom.json"
            )
            # Different ComfyUI server for this call only
            image_url = await pixelle_video.image(
                prompt="a cat",
                comfyui_url="http://192.168.1.100:8188"
            )
        """
        # Workflow lookup and connection settings are provided by the base service
        resolved = self._resolve_workflow(workflow=workflow)
        connection = self._prepare_comfykit_config(
            comfyui_url=comfyui_url,
            runninghub_api_key=runninghub_api_key
        )
        # Only options the caller actually supplied are sent to the workflow
        tunables = (
            ("width", width),
            ("height", height),
            ("negative_prompt", negative_prompt),
            ("steps", steps),
            ("seed", seed),
            ("cfg", cfg),
            ("sampler", sampler),
        )
        workflow_params = {"prompt": prompt}
        workflow_params.update({key: value for key, value in tunables if value is not None})
        workflow_params.update(params)
        logger.debug(f"Workflow parameters: {workflow_params}")
        try:
            kit = ComfyKit(**connection)
            # A RunningHub entry is executed by workflow id, anything else by file path
            on_runninghub = resolved["source"] == "runninghub" and "workflow_id" in resolved
            if on_runninghub:
                target = resolved["workflow_id"]
                logger.info(f"Executing RunningHub workflow: {target}")
            else:
                target = resolved["path"]
                logger.info(f"Executing selfhost workflow: {target}")
            outcome = await kit.execute(target, workflow_params)
            if outcome.status != "completed":
                reason = outcome.msg or "Unknown error"
                logger.error(f"Image generation failed: {reason}")
                raise Exception(f"Image generation failed: {reason}")
            if not outcome.images:
                logger.error("No images generated")
                raise Exception("No images generated")
            first_image = outcome.images[0]
            logger.info(f"✅ Generated image: {first_image}")
            return first_image
        except Exception as err:
            logger.error(f"Image generation error: {err}")
            raise
--- a/pixelle_video/services/media.py
+++ b/pixelle_video/services/media.py
@@ -0,0 +1,285 @@
 # Copyright (C) 2025 AIDC-AI
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #     http://www.apache.org/licenses/LICENSE-2.0
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 Media Generation Service - ComfyUI Workflow-based implementation
 Supports both image and video generation workflows.
 Automatically detects output type based on ExecuteResult.
 """
 from typing import Optional
 from comfykit import ComfyKit
 from loguru import logger
 from pixelle_video.services.comfy_base_service import ComfyBaseService
 from pixelle_video.models.media import MediaResult
 class MediaService(ComfyBaseService):
    """
    Media generation service - Workflow-based
    Uses ComfyKit to execute image/video generation workflows and wraps the
    first generated output in a MediaResult.
    Workflow files with either the image_ or video_ prefix are discovered.
    The kind of output to extract is selected by the caller through the
    ``media_type`` argument ("image" by default); it is NOT inferred from the
    workflow name or from the execution result.
    Usage:
        # Image generation (media_type defaults to "image")
        media = await pixelle_video.media(prompt="a cat")
        print(f"Generated image: {media.url}")
        # Use specific workflow
        media = await pixelle_video.media(
            prompt="a cat",
            workflow="image_flux.json"
        )
        # Video generation: media_type must be passed explicitly
        media = await pixelle_video.media(
            prompt="a cat running",
            workflow="video_example.json",
            media_type="video"
        )
        print(f"Generated video: {media.url} ({media.duration}s)")
        # List available workflows
        workflows = pixelle_video.media.list_workflows()
    """
    WORKFLOW_PREFIX = ""  # Not used for filtering: _scan_workflows matches several prefixes
    DEFAULT_WORKFLOW = None  # No hardcoded default, must be configured
    WORKFLOWS_DIR = "workflows"
    # Filename prefixes that mark a workflow file as usable by this service
    _WORKFLOW_PREFIXES = ("image_", "video_")
    # Values accepted for the media_type argument of __call__
    _MEDIA_TYPES = ("image", "video")
    def __init__(self, config: dict):
        """
        Initialize media service
        Args:
            config: Full application config dict
        """
        super().__init__(config, service_name="image")  # Keep "image" for config compatibility
    def _scan_workflows(self):
        """
        Scan workflows for both image_ and video_ prefixes
        Override parent method to support multiple prefixes
        Returns:
            List of parsed workflow info dicts sorted by their "key" entry;
            empty when no workflow source directory exists. Files that fail
            to parse are logged and skipped.
        """
        from pixelle_video.utils.os_util import list_resource_dirs, list_resource_files, get_resource_path
        from pathlib import Path
        workflows = []
        # Get all workflow source directories
        source_dirs = list_resource_dirs("workflows")
        if not source_dirs:
            logger.warning("No workflow source directories found")
            return workflows
        # Scan each source directory for workflow files
        for source_name in source_dirs:
            for filename in list_resource_files("workflows", source_name):
                # Keep only image_*/video_* JSON files
                if not (filename.startswith(self._WORKFLOW_PREFIXES) and filename.endswith('.json')):
                    continue
                try:
                    # Get actual file path
                    file_path = Path(get_resource_path("workflows", source_name, filename))
                    workflow_info = self._parse_workflow_file(file_path, source_name)
                    workflows.append(workflow_info)
                    logger.debug(f"Found workflow: {workflow_info['key']}")
                except Exception as e:
                    # One broken workflow file must not hide the others
                    logger.error(f"Failed to parse workflow {source_name}/{filename}: {e}")
        # Sort by key (source/name)
        return sorted(workflows, key=lambda w: w["key"])
    async def __call__(
        self,
        prompt: str,
        workflow: Optional[str] = None,
        # Media type specification (required for proper handling)
        media_type: str = "image",  # "image" or "video"
        # ComfyUI connection (optional overrides)
        comfyui_url: Optional[str] = None,
        runninghub_api_key: Optional[str] = None,
        # Common workflow parameters
        width: Optional[int] = None,
        height: Optional[int] = None,
        negative_prompt: Optional[str] = None,
        steps: Optional[int] = None,
        seed: Optional[int] = None,
        cfg: Optional[float] = None,
        sampler: Optional[str] = None,
        **params
    ) -> MediaResult:
        """
        Generate media (image or video) using workflow
        Media type must be specified explicitly via media_type parameter; it
        decides whether the result's video list or image list is read.
        Returns a MediaResult object containing media type and URL.
        Args:
            prompt: Media generation prompt
            workflow: Workflow to run; when omitted the choice is left to
                _resolve_workflow
            media_type: Type of media to generate - "image" or "video" (default: "image")
            comfyui_url: ComfyUI URL (optional, overrides config)
            runninghub_api_key: RunningHub API key (optional, overrides config)
            width: Media width
            height: Media height
            negative_prompt: Negative prompt
            steps: Sampling steps
            seed: Random seed
            cfg: CFG scale
            sampler: Sampler name
            **params: Additional workflow parameters
        Returns:
            MediaResult object with media_type ("image" or "video") and url;
            for videos, duration is set when the execution result exposes a
            truthy ``duration`` attribute, otherwise it is None
        Raises:
            ValueError: If media_type is neither "image" nor "video"
            Exception: If the workflow does not complete or returns no media
                of the requested type; execution errors are logged and re-raised
        Examples:
            # Simplest: image with the resolved default workflow
            media = await pixelle_video.media(prompt="a beautiful cat")
            print(f"Image: {media.url}")
            # Use specific workflow
            media = await pixelle_video.media(
                prompt="a cat",
                workflow="image_flux.json"
            )
            # Video workflow (media_type is required to get a video back)
            media = await pixelle_video.media(
                prompt="a cat running",
                workflow="video_example.json",
                media_type="video"
            )
            print(f"Video: {media.url}, duration: {media.duration}s")
            # With additional parameters
            media = await pixelle_video.media(
                prompt="a cat",
                workflow="image_flux.json",
                width=1024,
                height=1024,
                steps=20,
                seed=42
            )
            # With absolute path
            media = await pixelle_video.media(
                prompt="a cat",
                workflow="/path/to/custom.json"
            )
            # With custom ComfyUI server
            media = await pixelle_video.media(
                prompt="a cat",
                comfyui_url="http://192.168.1.100:8188"
            )
        """
        # 0. Reject unknown media types before spending time on a workflow run;
        #    previously any unrecognised value was silently handled as "image"
        if media_type not in self._MEDIA_TYPES:
            raise ValueError(f"Unknown media type: {media_type}")
        # 1. Resolve workflow (returns structured info)
        workflow_info = self._resolve_workflow(workflow=workflow)
        # 2. Prepare ComfyKit config (supports both selfhost and runninghub)
        kit_config = self._prepare_comfykit_config(
            comfyui_url=comfyui_url,
            runninghub_api_key=runninghub_api_key
        )
        # 3. Build workflow parameters (only options the caller supplied are sent)
        optional_params = {
            "width": width,
            "height": height,
            "negative_prompt": negative_prompt,
            "steps": steps,
            "seed": seed,
            "cfg": cfg,
            "sampler": sampler,
        }
        workflow_params = {"prompt": prompt}
        workflow_params.update(
            {name: value for name, value in optional_params.items() if value is not None}
        )
        # Add any additional parameters
        workflow_params.update(params)
        logger.debug(f"Workflow parameters: {workflow_params}")
        # 4. Execute workflow (ComfyKit auto-detects based on input type)
        try:
            kit = ComfyKit(**kit_config)
            # Determine what to pass to ComfyKit based on source
            if workflow_info["source"] == "runninghub" and "workflow_id" in workflow_info:
                # RunningHub: pass workflow_id (ComfyKit will use runninghub backend)
                workflow_input = workflow_info["workflow_id"]
                logger.info(f"Executing RunningHub workflow: {workflow_input}")
            else:
                # Selfhost: pass file path (ComfyKit will use local ComfyUI)
                workflow_input = workflow_info["path"]
                logger.info(f"Executing selfhost workflow: {workflow_input}")
            result = await kit.execute(workflow_input, workflow_params)
            # 5. Handle result based on specified media_type
            if result.status != "completed":
                error_msg = result.msg or "Unknown error"
                logger.error(f"Media generation failed: {error_msg}")
                raise Exception(f"Media generation failed: {error_msg}")
            if media_type == "video":
                # Video workflow - get video from result
                if not result.videos:
                    logger.error("No video generated (workflow returned no videos)")
                    raise Exception("No video generated")
                video_url = result.videos[0]
                logger.info(f"✅ Generated video: {video_url}")
                # Duration is optional on the result; a missing or falsy value maps to None
                duration = getattr(result, 'duration', None) or None
                return MediaResult(
                    media_type="video",
                    url=video_url,
                    duration=duration
                )
            # Image workflow - get image from result
            if not result.images:
                logger.error("No image generated (workflow returned no images)")
                raise Exception("No image generated")
            image_url = result.images[0]
            logger.info(f"✅ Generated image: {image_url}")
            return MediaResult(
                media_type="image",
                url=image_url
            )
        except Exception as e:
            logger.error(f"Media generation error: {e}")
            raise
--- a/pixelle_video/services/video.py
+++ b/pixelle_video/services/video.py
@@ -239,6 +239,51 @@ class VideoService:
            logger.error(f"FFmpeg concat filter error: {error_msg}")
            raise RuntimeError(f"Failed to concatenate videos: {error_msg}")
    def _get_video_duration(self, video: str) -> float:
        """
        Probe a video file and report its length in seconds
        Args:
            video: Video file path
        Returns:
            Container-level duration from the ffmpeg probe, or 0.0 when the
            probe fails or the duration cannot be read
        """
        try:
            return float(ffmpeg.probe(video)['format']['duration'])
        except Exception as exc:
            # Best effort: callers receive 0.0 instead of an exception
            logger.warning(f"Failed to get video duration: {exc}")
            return 0.0
    def _get_audio_duration(self, audio: str) -> float:
        """
        Probe an audio file and report its length in seconds
        Args:
            audio: Audio file path
        Returns:
            Container-level duration from the ffmpeg probe; when probing fails,
            a size-based estimate that is never below 1.0
        """
        try:
            return float(ffmpeg.probe(audio)['format']['duration'])
        except Exception as exc:
            logger.warning(f"Failed to get audio duration: {exc}, using estimate")
            # Very rough fallback: treat the file as ~16kbps MP3, i.e. about
            # 2000 bytes per second of audio
            import os
            size_in_bytes = os.path.getsize(audio)
            return max(1.0, size_in_bytes / 2000)  # At least 1 second
    def has_audio_stream(self, video: str) -> bool:
        """
        Tell whether a video file carries at least one audio stream
        Args:
            video: Video file path
        Returns:
            True when the ffmpeg probe lists an audio stream; False when it
            lists none or when probing/inspection fails
        """
        try:
            streams = ffmpeg.probe(video).get('streams', [])
            # Every stream is inspected (no short-circuit), so a malformed
            # entry still ends up in the failure branch below
            audio_count = sum(1 for stream in streams if stream['codec_type'] == 'audio')
            has_audio = audio_count > 0
            logger.debug(f"Video {video} has_audio={has_audio}")
            return has_audio
        except Exception as exc:
            logger.warning(f"Failed to probe video audio streams: {exc}, assuming no audio")
            return False
    def merge_audio_video(
        self,
        video: str,
@@ -247,9 +292,18 @@ class VideoService:
        replace_audio: bool = True,
        audio_volume: float = 1.0,
        video_volume: float = 0.0,
        pad_strategy: str = "freeze",  # "freeze" (freeze last frame) or "black" (black screen)
    ) -> str:
        """
-        Merge audio with video
+        Merge audio with video, using the longer duration
        The output video duration will be the maximum of video and audio duration.
        If audio is longer than video, the video will be padded using the specified strategy.
        Automatically handles videos with or without audio streams.
        - If video has no audio: adds the audio track
        - If video has audio and replace_audio=True: replaces with new audio
        - If video has audio and replace_audio=False: mixes both audio tracks
        Args:
            video: Video file path
@@ -259,6 +313,9 @@ class VideoService:
            audio_volume: Volume of the new audio (0.0 to 1.0+)
            video_volume: Volume of original video audio (0.0 to 1.0+)
                         Only used when replace_audio=False
            pad_strategy: Strategy to pad video if audio is longer
                         - "freeze": Freeze last frame (default)
                         - "black": Fill with black screen
        Returns:
            Path to the output video file
@@ -267,28 +324,110 @@ class VideoService:
            RuntimeError: If FFmpeg execution fails
        Note:
-            - When replace_audio=True, video's original audio is removed
+            - Uses the longer duration between video and audio
-            - When replace_audio=False, original and new audio are mixed
+            - When audio is longer, video is padded using pad_strategy
-            - Audio is trimmed/extended to match video duration
+            - When video is longer, audio is looped or extended
            - Automatically detects if video has audio
            - When video is silent, audio is added regardless of replace_audio
            - When replace_audio=True and video has audio, original audio is removed
            - When replace_audio=False and video has audio, original and new audio are mixed
        """
        # Get durations of video and audio
        video_duration = self._get_video_duration(video)
        audio_duration = self._get_audio_duration(audio)
        logger.info(f"Video duration: {video_duration:.2f}s, Audio duration: {audio_duration:.2f}s")
        # Determine target duration (max of both)
        target_duration = max(video_duration, audio_duration)
        logger.info(f"Target output duration: {target_duration:.2f}s")
        # Check if video has audio stream
        video_has_audio = self.has_audio_stream(video)
        # Prepare video stream (potentially with padding)
        input_video = ffmpeg.input(video)
        video_stream = input_video.video
        # Pad video if audio is longer
        if audio_duration > video_duration:
            pad_duration = audio_duration - video_duration
            logger.info(f"Audio is longer, padding video by {pad_duration:.2f}s using '{pad_strategy}' strategy")
            if pad_strategy == "freeze":
                # Freeze last frame: tpad filter
                video_stream = video_stream.filter('tpad', stop_mode='clone', stop_duration=pad_duration)
            else:  # black
                # Generate black frames for padding duration
                from pixelle_video.utils.os_util import get_temp_path
                import os
                # Get video properties
                probe = ffmpeg.probe(video)
                video_info = next(s for s in probe['streams'] if s['codec_type'] == 'video')
                width = int(video_info['width'])
                height = int(video_info['height'])
                fps_str = video_info['r_frame_rate']
                fps_num, fps_den = map(int, fps_str.split('/'))
                fps = fps_num / fps_den if fps_den != 0 else 30
                # Create black video for padding
                black_video_path = get_temp_path(f"black_pad_{os.path.basename(output)}")
                black_input = ffmpeg.input(
                    f'color=c=black:s={width}x{height}:r={fps}',
                    f='lavfi',
                    t=pad_duration
                )
                # Concatenate original video with black padding
                video_stream = ffmpeg.concat(video_stream, black_input.video, v=1, a=0)
        # Prepare audio stream
        input_audio = ffmpeg.input(audio)
        audio_stream = input_audio.audio.filter('volume', audio_volume)
        if not video_has_audio:
            logger.info(f"Video has no audio stream, adding audio track")
            # Video is silent, just add the audio
            try:
                (
                    ffmpeg
                    .output(
                        video_stream,
                        audio_stream,
                        output,
                        vcodec='libx264',  # Re-encode video if padded
                        acodec='aac',
                        audio_bitrate='192k',
                        t=target_duration  # Trim to target duration
                    )
                    .overwrite_output()
                    .run(capture_stdout=True, capture_stderr=True)
                )
                logger.success(f"Audio added to silent video: {output}")
                return output
            except ffmpeg.Error as e:
                error_msg = e.stderr.decode() if e.stderr else str(e)
                logger.error(f"FFmpeg error adding audio to silent video: {error_msg}")
                raise RuntimeError(f"Failed to add audio to video: {error_msg}")
        # Video has audio, proceed with merging
        logger.info(f"Merging audio with video (replace={replace_audio})")
        try:
            input_video = ffmpeg.input(video)
            input_audio = ffmpeg.input(audio)
            if replace_audio:
                # Replace audio: use only new audio, ignore original
                (
                    ffmpeg
                    .output(
-                        input_video.video,
+                        video_stream,
-                        input_audio.audio.filter('volume', audio_volume),
+                        audio_stream,
                        output,
-                        vcodec='copy',
+                        vcodec='libx264',  # Re-encode video if padded
                        acodec='aac',
                        audio_bitrate='192k',
-                        shortest=None
+                        t=target_duration  # Trim to target duration
                    )
                    .overwrite_output()
                    .run(capture_stdout=True, capture_stderr=True)
@@ -298,22 +437,23 @@ class VideoService:
                mixed_audio = ffmpeg.filter(
                    [
                        input_video.audio.filter('volume', video_volume),
-                        input_audio.audio.filter('volume', audio_volume)
+                        audio_stream
                    ],
                    'amix',
                    inputs=2,
-                    duration='first'
+                    duration='longest'  # Use longest audio
                )
                (
                    ffmpeg
                    .output(
-                        input_video.video,
+                        video_stream,
                        mixed_audio,
                        output,
-                        vcodec='copy',
+                        vcodec='libx264',  # Re-encode video if padded
                        acodec='aac',
-                        audio_bitrate='192k'
+                        audio_bitrate='192k',
                        t=target_duration  # Trim to target duration
                    )
                    .overwrite_output()
                    .run(capture_stdout=True, capture_stderr=True)
@@ -326,6 +466,92 @@ class VideoService:
            logger.error(f"FFmpeg merge error: {error_msg}")
            raise RuntimeError(f"Failed to merge audio and video: {error_msg}")
    def overlay_image_on_video(
        self,
        video: str,
        overlay_image: str,
        output: str,
        scale_mode: str = "contain"
    ) -> str:
        """
        Overlay a transparent image on top of video
        The base video is first scaled to the overlay image's pixel size, then
        the overlay is composited on top and the result is encoded with libx264.
        Args:
            video: Base video file path
            overlay_image: Transparent overlay image path (e.g., rendered HTML with transparent background)
            output: Output video file path
            scale_mode: How to scale the base video to fit the overlay size
                - "contain": Scale video to fit within overlay dimensions (letterbox/pillarbox)
                - "cover": Scale video to cover overlay dimensions (may crop)
                - "stretch": Stretch video to exact overlay dimensions
                Any other value is handled like "stretch".
        Returns:
            Path to the output video file
        Raises:
            RuntimeError: If the overlay image exposes no image/video stream,
                or if FFmpeg execution fails
        Note:
            - Overlay image should have transparent background
            - Video is scaled to match overlay dimensions based on scale_mode
            - Final video size matches overlay image size
            - Video codec is re-encoded to support overlay
        """
        logger.info(f"Overlaying image on video (scale_mode={scale_mode})")
        try:
            # Get overlay image dimensions (ffprobe reports a still image as a video stream)
            overlay_probe = ffmpeg.probe(overlay_image)
            overlay_stream = next(
                (s for s in overlay_probe.get('streams', []) if s.get('codec_type') == 'video'),
                None
            )
            if overlay_stream is None:
                # Without a default, next() would leak a bare StopIteration to the
                # caller; report the problem with the documented exception type.
                logger.error(f"No image stream found in overlay: {overlay_image}")
                raise RuntimeError(
                    f"Failed to overlay image on video: no image stream found in {overlay_image}"
                )
            overlay_width = int(overlay_stream['width'])
            overlay_height = int(overlay_stream['height'])
            logger.debug(f"Overlay dimensions: {overlay_width}x{overlay_height}")
            input_video = ffmpeg.input(video)
            input_overlay = ffmpeg.input(overlay_image)
            # Scale video to fit overlay size using scale_mode
            if scale_mode == "contain":
                # Shrink to fit while keeping aspect ratio, then pad (centered, black)
                # up to the exact overlay size
                scaled_video = (
                    input_video
                    .filter('scale', overlay_width, overlay_height, force_original_aspect_ratio='decrease')
                    .filter('pad', overlay_width, overlay_height, '(ow-iw)/2', '(oh-ih)/2', color='black')
                )
            elif scale_mode == "cover":
                # Grow to cover while keeping aspect ratio, then crop the excess
                scaled_video = (
                    input_video
                    .filter('scale', overlay_width, overlay_height, force_original_aspect_ratio='increase')
                    .filter('crop', overlay_width, overlay_height)
                )
            else:  # stretch
                # Stretch to exact dimensions
                scaled_video = input_video.filter('scale', overlay_width, overlay_height)
            # Overlay the transparent image on top of the scaled video
            output_stream = ffmpeg.overlay(scaled_video, input_overlay)
            # NOTE(review): only the composited video stream is passed to the output,
            # so the base video's audio does not appear to be carried over - confirm
            # that callers add audio in a separate step.
            (
                ffmpeg
                .output(output_stream, output,
                        vcodec='libx264',
                        pix_fmt='yuv420p',
                        preset='medium',
                        crf=23)
                .overwrite_output()
                .run(capture_stdout=True, capture_stderr=True)
            )
            logger.success(f"Image overlaid on video: {output}")
            return output
        except ffmpeg.Error as e:
            error_msg = e.stderr.decode() if e.stderr else str(e)
            logger.error(f"FFmpeg overlay error: {error_msg}")
            raise RuntimeError(f"Failed to overlay image on video: {error_msg}")
    def create_video_from_image(
        self,
        image: str,
--- a/pixelle_video/utils/content_generators.py
+++ b/pixelle_video/utils/content_generators.py
@@ -321,6 +321,98 @@ async def generate_image_prompts(
    return all_prompts
 async def generate_video_prompts(
    llm_service,
    narrations: List[str],
    min_words: int = 30,
    max_words: int = 60,
    batch_size: int = 10,
    max_retries: int = 3,
    progress_callback: Optional[callable] = None
 ) -> List[str]:
    """
    Generate video prompts from narrations (with batching and retry)
    Narrations are sent to the LLM in batches of at most ``batch_size`` items.
    A batch is attempted again when the LLM call raises, the response cannot be
    parsed, or the response does not hold exactly one prompt per narration.
    Args:
        llm_service: LLM service instance (awaited with prompt/temperature/max_tokens)
        narrations: List of narrations
        min_words: Min video prompt length
        max_words: Max video prompt length
        batch_size: Max narrations per batch (default: 10), must be >= 1
        max_retries: Max attempts per batch (default: 3); values below 1 are
            treated as 1 so that no batch is ever skipped silently
        progress_callback: Optional callback(completed, total, message) for progress updates
    Returns:
        List of video prompts (base prompts, without prefix applied), one per narration
    Raises:
        ValueError: If batch_size is less than 1
        Exception: The error of the last attempt once a batch has used all its attempts
    """
    from pixelle_video.prompts.video_generation import build_video_prompt_prompt
    if batch_size < 1:
        # A zero step breaks range() and a negative one yields no batches at all
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    # Always make at least one attempt; with zero attempts the retry loop would
    # never run and the batch would be dropped without any error
    attempts = max(1, max_retries)
    logger.info(f"Generating video prompts for {len(narrations)} narrations (batch_size={batch_size})")
    # Split narrations into batches
    batches = [narrations[i:i + batch_size] for i in range(0, len(narrations), batch_size)]
    logger.info(f"Split into {len(batches)} batches")
    all_prompts = []
    # Process each batch
    for batch_idx, batch_narrations in enumerate(batches, 1):
        logger.info(f"Processing batch {batch_idx}/{len(batches)} ({len(batch_narrations)} narrations)")
        batch_prompts = None
        # Retry logic for this batch (only generation and validation are retried)
        for attempt in range(1, attempts + 1):
            try:
                # Generate prompts for this batch
                prompt = build_video_prompt_prompt(
                    narrations=batch_narrations,
                    min_words=min_words,
                    max_words=max_words
                )
                response = await llm_service(
                    prompt=prompt,
                    temperature=0.7,
                    max_tokens=8192
                )
                logger.debug(f"Batch {batch_idx} attempt {attempt}: LLM response length: {len(response)} chars")
                # Parse JSON
                result = _parse_json(response)
                if "video_prompts" not in result:
                    raise KeyError("Invalid response format: missing 'video_prompts'")
                candidate = result["video_prompts"]
                # Validate batch result
                if not isinstance(candidate, list):
                    raise ValueError(
                        f"Invalid response format: 'video_prompts' must be a list, got {type(candidate).__name__}"
                    )
                if len(candidate) != len(batch_narrations):
                    raise ValueError(
                        f"Prompt count mismatch: expected {len(batch_narrations)}, got {len(candidate)}"
                    )
            except Exception as e:
                logger.warning(f"✗ Batch {batch_idx} attempt {attempt} failed: {e}")
                if attempt >= attempts:
                    raise
                logger.info(f"Retrying batch {batch_idx}...")
            else:
                batch_prompts = candidate
                break  # Success, stop retrying this batch
        # Success - add to all_prompts. This happens outside the try block so that
        # a failing progress callback cannot trigger a retry that would append
        # the same batch a second time.
        all_prompts.extend(batch_prompts)
        logger.info(f"✓ Batch {batch_idx} completed: {len(batch_prompts)} video prompts")
        # Report progress
        if progress_callback:
            completed = len(all_prompts)
            total = len(narrations)
            progress_callback(completed, total, f"Batch {batch_idx}/{len(batches)} completed")
    logger.info(f"✅ Generated {len(all_prompts)} video prompts")
    return all_prompts
 def _parse_json(text: str) -> dict:
    """
    Parse JSON from text, with fallback to extract JSON from markdown code blocks
--- a/pixelle_video/utils/os_util.py
+++ b/pixelle_video/utils/os_util.py
@@ -260,7 +260,7 @@ def get_task_path(task_id: str, *paths: str) -> str:
 def get_task_frame_path(
    task_id: str, 
    frame_index: int, 
-    file_type: Literal["audio", "image", "composed", "segment"]
+    file_type: Literal["audio", "image", "video", "composed", "segment"]
 ) -> str:
    """
    Get frame file path within task directory
@@ -268,7 +268,7 @@ def get_task_frame_path(
    Args:
        task_id: Task ID
        frame_index: Frame index (0-based internally, but filename starts from 01)
-        file_type: File type (audio/image/composed/segment)
+        file_type: File type (audio/image/video/composed/segment)
    Returns:
        Absolute path to frame file
@@ -280,6 +280,7 @@ def get_task_frame_path(
    ext_map = {
        "audio": "mp3",
        "image": "png",
        "video": "mp4",
        "composed": "png",
        "segment": "mp4"
    }
--- a/web/app.py
+++ b/web/app.py
@@ -782,10 +782,29 @@ def main():
            generator_for_params = HTMLFrameGenerator(template_path_for_params)
            custom_params_for_video = generator_for_params.parse_template_parameters()
-            # Detect if template requires image generation
+            # Detect template media type
-            template_requires_image = generator_for_params.requires_image()
+            from pathlib import Path
-            # Store in session state for Image Section to read
+            template_name = Path(frame_template).name
-            st.session_state['template_requires_image'] = template_requires_image
+            
            if template_name.startswith("video_"):
                # Video template
                template_media_type = "video"
                template_requires_media = True
            elif generator_for_params.requires_image():
                # Image template
                template_media_type = "image"
                template_requires_media = True
            else:
                # Text-only template
                template_media_type = "text"
                template_requires_media = False
            # Store in session state for workflow filtering
            st.session_state['template_media_type'] = template_media_type
            st.session_state['template_requires_media'] = template_requires_media
            # Backward compatibility
            st.session_state['template_requires_image'] = (template_media_type == "image")
            custom_values_for_video = {}
            if custom_params_for_video:
@@ -928,25 +947,51 @@ def main():
                            logger.exception(e)
        # ====================================================================
-        # Image Generation Section (conditional based on template)
+        # Media Generation Section (conditional based on template)
        # ====================================================================
-        # Check if current template requires image generation
+        # Check if current template requires media generation
-        if st.session_state.get('template_requires_image', True):
+        template_media_type = st.session_state.get('template_media_type', 'image')
-            # Template requires images - show full Image Section
+        template_requires_media = st.session_state.get('template_requires_media', True)
        if template_requires_media:
            # Template requires media - show Media Generation Section
            with st.container(border=True):
-                st.markdown(f"**{tr('section.image')}**")
+                # Dynamic section title based on template type
                if template_media_type == "video":
                    section_title = tr('section.video')
                else:
                    section_title = tr('section.image')
                st.markdown(f"**{section_title}**")
                # 1. ComfyUI Workflow selection
                with st.expander(tr("help.feature_description"), expanded=False):
                    st.markdown(f"**{tr('help.what')}**")
                    if template_media_type == "video":
                        st.markdown(tr('style.video_workflow_what'))
                    else:
                        st.markdown(tr("style.workflow_what"))
                    st.markdown(f"**{tr('help.how')}**")
                    if template_media_type == "video":
                        st.markdown(tr('style.video_workflow_how'))
                    else:
                        st.markdown(tr("style.workflow_how"))
                    st.markdown(f"**{tr('help.note')}**")
                    if template_media_type == "video":
                        st.markdown(tr('style.video_size_note'))
                    else:
                        st.markdown(tr("style.image_size_note"))
-                # Get available workflows from pixelle_video (with source info)
+                # Get available workflows and filter by template type
-                workflows = pixelle_video.image.list_workflows()
+                all_workflows = pixelle_video.media.list_workflows()
                # Filter workflows based on template media type
                if template_media_type == "video":
                    # Only show video_ workflows
                    workflows = [wf for wf in all_workflows if "video_" in wf["key"].lower()]
                else:
                    # Only show image_ workflows (exclude video_)
                    workflows = [wf for wf in all_workflows if "video_" not in wf["key"].lower()]
                # Build options for selectbox
                # Display: "image_flux.json - Runninghub"
@@ -979,25 +1024,39 @@ def main():
                    workflow_key = "runninghub/image_flux.json"  # fallback
-                # 2. Image size input
+                # 2. Media size input
                col1, col2 = st.columns(2)
                with col1:
                    if template_media_type == "video":
                        width_label = tr('style.video_width')
                        width_help = tr('style.video_width_help')
                    else:
                        width_label = tr('style.image_width')
                        width_help = tr('style.image_width_help')
                    image_width = st.number_input(
-                        tr('style.image_width'),
+                        width_label,
                        min_value=128,
                        value=1024,
                        step=1,
                        label_visibility="visible",
-                        help=tr('style.image_width_help')
+                        help=width_help
                    )
                with col2:
                    if template_media_type == "video":
                        height_label = tr('style.video_height')
                        height_help = tr('style.video_height_help')
                    else:
                        height_label = tr('style.image_height')
                        height_help = tr('style.image_height_help')
                    image_height = st.number_input(
-                        tr('style.image_height'),
+                        height_label,
                        min_value=128,
                        value=1024,
                        step=1,
                        label_visibility="visible",
-                        help=tr('style.image_height_help')
+                        help=height_help
                    )
                # 3. Prompt prefix input
@@ -1014,44 +1073,61 @@ def main():
                    help=tr("style.prompt_prefix_help")
                )
-                # Style preview expander (similar to template preview)
+                # Media preview expander
-                with st.expander(tr("style.preview_title"), expanded=False):
+                preview_title = tr("style.video_preview_title") if template_media_type == "video" else tr("style.preview_title")
                with st.expander(preview_title, expanded=False):
                    # Test prompt input
                    if template_media_type == "video":
                        test_prompt_label = tr("style.test_video_prompt")
                        test_prompt_value = "a dog running in the park"
                    else:
                        test_prompt_label = tr("style.test_prompt")
                        test_prompt_value = "a dog"
                    test_prompt = st.text_input(
-                        tr("style.test_prompt"),
+                        test_prompt_label,
-                        value="a dog",
+                        value=test_prompt_value,
                        help=tr("style.test_prompt_help"),
                        key="style_test_prompt"
                    )
                    # Preview button
-                    if st.button(tr("style.preview"), key="preview_style", use_container_width=True):
+                    preview_button_label = tr("style.video_preview") if template_media_type == "video" else tr("style.preview")
-                        with st.spinner(tr("style.previewing")):
+                    if st.button(preview_button_label, key="preview_style", use_container_width=True):
                        previewing_text = tr("style.video_previewing") if template_media_type == "video" else tr("style.previewing")
                        with st.spinner(previewing_text):
                            try:
                                from pixelle_video.utils.prompt_helper import build_image_prompt
                                # Build final prompt with prefix
                                final_prompt = build_image_prompt(test_prompt, prompt_prefix)
-                                # Generate preview image (use user-specified size)
+                                # Generate preview media (use user-specified size and media type)
-                                preview_image_path = run_async(pixelle_video.image(
+                                media_result = run_async(pixelle_video.media(
                                    prompt=final_prompt,
                                    workflow=workflow_key,
                                    media_type=template_media_type,
                                    width=int(image_width),
                                    height=int(image_height)
                                ))
                                preview_media_path = media_result.url
                                # Display preview (support both URL and local path)
-                                if preview_image_path:
+                                if preview_media_path:
-                                    st.success(tr("style.preview_success"))
+                                    success_text = tr("style.video_preview_success") if template_media_type == "video" else tr("style.preview_success")
                                    st.success(success_text)
-                                    # Read and encode image
+                                    if template_media_type == "video":
-                                    if preview_image_path.startswith('http'):
+                                        # Display video
                                        st.video(preview_media_path)
                                    else:
                                        # Display image
                                        if preview_media_path.startswith('http'):
                                            # URL - use directly
-                                        img_html = f'<div class="preview-image"><img src="{preview_image_path}" alt="Style Preview"/></div>'
+                                            img_html = f'<div class="preview-image"><img src="{preview_media_path}" alt="Style Preview"/></div>'
                                        else:
                                            # Local file - encode as base64
-                                        with open(preview_image_path, 'rb') as f:
+                                            with open(preview_media_path, 'rb') as f:
                                                img_data = base64.b64encode(f.read()).decode()
                                            img_html = f'<div class="preview-image"><img src="data:image/png;base64,{img_data}" alt="Style Preview"/></div>'
@@ -1061,7 +1137,7 @@ def main():
                                    st.info(f"**{tr('style.final_prompt_label')}**\n{final_prompt}")
                                    # Show file path
-                                    st.caption(f"📁 {preview_image_path}")
+                                    st.caption(f"📁 {preview_media_path}")
                                else:
                                    st.error(tr("style.preview_failed_general"))
                            except Exception as e:
--- a/web/i18n/locales/en_US.json
+++ b/web/i18n/locales/en_US.json
@@ -8,6 +8,8 @@
    "section.bgm": "🎵 Background Music",
    "section.tts": "🎤 Voiceover",
    "section.image": "🎨 Image Generation",
    "section.video": "🎬 Video Generation",
    "section.media": "🎨 Media Generation",
    "section.template": "📐 Storyboard Template",
    "section.video_generation": "🎬 Generate Video",
@@ -45,12 +47,19 @@
    "style.workflow": "Workflow Selection",
    "style.workflow_what": "Determines how each frame's illustration is generated and its effect (e.g., using FLUX, SD models)",
    "style.workflow_how": "Place the exported image_xxx.json workflow file(API format) into the workflows/selfhost/ folder (for local ComfyUI) or the workflows/runninghub/ folder (for cloud)",
    "style.video_workflow_what": "Determines how each frame's video clip is generated and its effect (e.g., using different video generation models)",
    "style.video_workflow_how": "Place the exported video_xxx.json workflow file(API format) into the workflows/selfhost/ folder (for local ComfyUI) or the workflows/runninghub/ folder (for cloud)",
    "style.image_size": "Image Size",
    "style.image_width": "Width",
    "style.image_height": "Height",
    "style.image_width_help": "Width of AI-generated images (Note: This is the image size, not the final video size. Video size is determined by the template)",
    "style.image_height_help": "Height of AI-generated images (Note: This is the image size, not the final video size. Video size is determined by the template)",
    "style.video_width": "Video Width",
    "style.video_height": "Video Height",
    "style.video_width_help": "Width of AI-generated video (Note: This is the video clip size, will auto-adapt to template size)",
    "style.video_height_help": "Height of AI-generated video (Note: This is the video clip size, will auto-adapt to template size)",
    "style.image_size_note": "Image size controls the dimensions of AI-generated illustrations, and does not affect the final video size. Video size is determined by the Storyboard Template below.",
    "style.video_size_note": "Video size will automatically adapt to the template size, no manual adjustment needed.",
    "style.prompt_prefix": "Prompt Prefix",
    "style.prompt_prefix_what": "Automatically added before all image prompts to control the illustration style uniformly (e.g., cartoon, realistic)",
    "style.prompt_prefix_how": "Enter style description in the input box below. To save permanently, edit the config.yaml file",
@@ -60,11 +69,16 @@
    "style.description": "Style Description",
    "style.description_placeholder": "Describe the illustration style you want (any language)...",
    "style.preview_title": "Preview Style",
    "style.video_preview_title": "Preview Video",
    "style.test_prompt": "Test Prompt",
    "style.test_video_prompt": "Test Video Prompt",
    "style.test_prompt_help": "Enter test prompt to preview style effect",
    "style.preview": "🖼️ Generate Preview",
    "style.video_preview": "🎬 Generate Video Preview",
    "style.previewing": "Generating style preview...",
    "style.video_previewing": "Generating video preview...",
    "style.preview_success": "✅ Preview generated successfully!",
    "style.video_preview_success": "✅ Video preview generated successfully!",
    "style.preview_caption": "Style Preview",
    "style.preview_failed": "Preview failed: {error}",
    "style.preview_failed_general": "Failed to generate preview image",
@@ -140,12 +154,16 @@
    "progress.generating_narrations": "Generating narrations...",
    "progress.splitting_script": "Splitting script...",
    "progress.generating_image_prompts": "Generating image prompts...",
    "progress.generating_video_prompts": "Generating video prompts...",
    "progress.preparing_frames": "Preparing frames...",
    "progress.frame": "Frame {current}/{total}",
    "progress.frame_step": "Frame {current}/{total} - Step {step}/4: {action}",
-    "progress.step_audio": "Generating audio...",
+    "progress.processing_frame": "Processing frame {current}/{total}...",
-    "progress.step_image": "Generating image...",
+    "progress.step_audio": "Generating audio",
-    "progress.step_compose": "Composing frame...",
+    "progress.step_image": "Generating image",
-    "progress.step_video": "Creating video segment...",
+    "progress.step_media": "Generating media",
    "progress.step_compose": "Composing frame",
    "progress.step_video": "Creating video segment",
    "progress.concatenating": "Concatenating video...",
    "progress.finalizing": "Finalizing...",
    "progress.completed": "✅ Completed",
--- a/web/i18n/locales/zh_CN.json
+++ b/web/i18n/locales/zh_CN.json
@@ -8,6 +8,8 @@
    "section.bgm": "🎵 背景音乐",
    "section.tts": "🎤 配音合成",
    "section.image": "🎨 插图生成",
    "section.video": "🎬 视频生成",
    "section.media": "🎨 媒体生成",
    "section.template": "📐 分镜模板",
    "section.video_generation": "🎬 生成视频",
@@ -45,12 +47,19 @@
    "style.workflow": "工作流选择",
    "style.workflow_what": "决定视频中每帧插图的生成方式和效果（如使用 FLUX、SD 等模型）",
    "style.workflow_how": "将导出的 image_xxx.json 工作流文件（API格式）放入 workflows/selfhost/（本地 ComfyUI）或 workflows/runninghub/（云端）文件夹",
    "style.video_workflow_what": "决定视频中每帧视频片段的生成方式和效果（如使用不同的视频生成模型）",
    "style.video_workflow_how": "将导出的 video_xxx.json 工作流文件（API格式）放入 workflows/selfhost/（本地 ComfyUI）或 workflows/runninghub/（云端）文件夹",
    "style.image_size": "图片尺寸",
    "style.image_width": "宽度",
    "style.image_height": "高度",
    "style.image_width_help": "AI 生成插图的宽度（注意：这是插图尺寸，不是最终视频尺寸。视频尺寸由模板决定）",
    "style.image_height_help": "AI 生成插图的高度（注意：这是插图尺寸，不是最终视频尺寸。视频尺寸由模板决定）",
    "style.video_width": "视频宽度",
    "style.video_height": "视频高度",
    "style.video_width_help": "AI 生成视频的宽度（注意：这是视频片段尺寸，会自适应模板尺寸）",
    "style.video_height_help": "AI 生成视频的高度（注意：这是视频片段尺寸，会自适应模板尺寸）",
    "style.image_size_note": "图片尺寸控制 AI 生成的插图大小，不影响最终视频尺寸。视频尺寸由下方的「📐 分镜模板」决定。",
    "style.video_size_note": "视频尺寸会自动适配模板尺寸，无需手动调整。",
    "style.prompt_prefix": "提示词前缀",
    "style.prompt_prefix_what": "自动添加到所有图片提示词前面，统一控制插图风格（如：卡通风格、写实风格等）",
    "style.prompt_prefix_how": "直接在下方输入框填写风格描述。若要永久保存，需编辑 config.yaml 文件",
@@ -60,11 +69,16 @@
    "style.description": "风格描述",
    "style.description_placeholder": "描述您想要的插图风格（任何语言）...",
    "style.preview_title": "预览风格",
    "style.video_preview_title": "预览视频",
    "style.test_prompt": "测试提示词",
    "style.test_video_prompt": "测试视频提示词",
    "style.test_prompt_help": "输入测试提示词来预览风格效果",
    "style.preview": "🖼️ 生成预览",
    "style.video_preview": "🎬 生成视频预览",
    "style.previewing": "正在生成风格预览...",
    "style.video_previewing": "正在生成视频预览...",
    "style.preview_success": "✅ 预览生成成功！",
    "style.video_preview_success": "✅ 视频预览生成成功！",
    "style.preview_caption": "风格预览",
    "style.preview_failed": "预览失败：{error}",
    "style.preview_failed_general": "预览图片生成失败",
@@ -140,12 +154,16 @@
    "progress.generating_narrations": "生成旁白...",
    "progress.splitting_script": "切分脚本...",
    "progress.generating_image_prompts": "生成图片提示词...",
    "progress.generating_video_prompts": "生成视频提示词...",
    "progress.preparing_frames": "准备分镜...",
    "progress.frame": "分镜 {current}/{total}",
    "progress.frame_step": "分镜 {current}/{total} - 步骤 {step}/4: {action}",
-    "progress.step_audio": "生成语音...",
+    "progress.processing_frame": "处理分镜 {current}/{total}...",
-    "progress.step_image": "生成插图...",
+    "progress.step_audio": "生成语音",
-    "progress.step_compose": "合成画面...",
+    "progress.step_image": "生成插图",
-    "progress.step_video": "创建视频片段...",
+    "progress.step_media": "生成媒体",
    "progress.step_compose": "合成画面",
    "progress.step_video": "创建视频片段",
    "progress.concatenating": "正在拼接视频...",
    "progress.finalizing": "完成中...",
    "progress.completed": "✅ 生成完成",