Support video generation based on image assets

This commit is contained in:
puke
2025-12-03 20:11:32 +08:00
parent 6e99612a68
commit ea784e0d06
9 changed files with 1180 additions and 40 deletions

View File

@@ -21,6 +21,7 @@ from pixelle_video.pipelines.base import BasePipeline
from pixelle_video.pipelines.linear import LinearVideoPipeline, PipelineContext
from pixelle_video.pipelines.standard import StandardPipeline
from pixelle_video.pipelines.custom import CustomPipeline
from pixelle_video.pipelines.asset_based import AssetBasedPipeline
__all__ = [
"BasePipeline",
@@ -28,5 +29,6 @@ __all__ = [
"PipelineContext",
"StandardPipeline",
"CustomPipeline",
"AssetBasedPipeline",
]

View File

@@ -0,0 +1,661 @@
# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Asset-Based Video Pipeline
Generates marketing videos from user-provided assets (images/videos) rather than
AI-generated media. Ideal for small businesses with existing media libraries.
Workflow:
1. Analyze uploaded assets (images/videos)
2. Generate script based on user intent and available assets
3. Match assets to script scenes
4. Compose final video with narrations
Example:
pipeline = AssetBasedPipeline(pixelle_video)
result = await pipeline(
assets=["/path/img1.jpg", "/path/img2.jpg"],
video_title="Pet Store Year-End Sale",
style="warm and friendly",
duration=30
)
"""
from typing import List, Dict, Any, Optional
from pathlib import Path
from loguru import logger
from pydantic import BaseModel, Field
from pixelle_video.pipelines.linear import LinearVideoPipeline, PipelineContext
from pixelle_video.utils.os_util import (
create_task_output_dir,
get_task_final_video_path
)
# ==================== Structured Output Models ====================
class SceneScript(BaseModel):
"""Single scene in the video script"""
scene_number: int = Field(description="Scene number starting from 1")
asset_path: str = Field(description="Path to the asset file for this scene")
narrations: List[str] = Field(description="List of narration sentences for this scene (1-5 sentences)")
duration: int = Field(description="Estimated duration in seconds for this scene")
class VideoScript(BaseModel):
"""Complete video script with scenes"""
scenes: List[SceneScript] = Field(description="List of scenes in the video")
class AssetBasedPipeline(LinearVideoPipeline):
"""
Asset-Based Video Pipeline
Generates videos from user-provided assets instead of AI-generated media.
"""
def __init__(self, core):
"""
Initialize pipeline
Args:
core: PixelleVideoCore instance
"""
super().__init__(core)
self.asset_index: Dict[str, Any] = {} # In-memory asset metadata
async def __call__(
self,
assets: List[str],
video_title: str = "",
intent: Optional[str] = None,
style: str = "professional and engaging",
duration: int = 30,
source: str = "runninghub",
bgm_path: Optional[str] = None,
bgm_volume: float = 0.2,
bgm_mode: str = "loop",
**kwargs
) -> PipelineContext:
"""
Execute pipeline with user-provided assets
Args:
assets: List of asset file paths
video_title: Video title
intent: Video intent/purpose (defaults to video_title)
style: Video style
duration: Target duration in seconds
source: Workflow source ("runninghub" or "selfhost")
bgm_path: Path to background music file (optional)
bgm_volume: BGM volume (0.0-1.0, default 0.2)
bgm_mode: BGM mode ("loop" or "once", default "loop")
**kwargs: Additional parameters
Returns:
Pipeline context with generated video
"""
from pixelle_video.pipelines.linear import PipelineContext
# Create custom context with asset-specific parameters
ctx = PipelineContext(
input_text=intent or video_title, # Use intent or title as input_text
params={
"assets": assets,
"video_title": video_title,
"intent": intent or video_title,
"style": style,
"duration": duration,
"source": source,
"bgm_path": bgm_path,
"bgm_volume": bgm_volume,
"bgm_mode": bgm_mode,
**kwargs
}
)
# Store request parameters in context for easy access
ctx.request = ctx.params
try:
# Execute pipeline lifecycle
await self.setup_environment(ctx)
await self.determine_title(ctx)
await self.generate_content(ctx)
await self.plan_visuals(ctx)
await self.initialize_storyboard(ctx)
await self.produce_assets(ctx)
await self.post_production(ctx)
await self.finalize(ctx)
return ctx
except Exception as e:
await self.handle_exception(ctx, e)
raise
async def setup_environment(self, context: PipelineContext) -> PipelineContext:
"""
Analyze uploaded assets and build asset index
Args:
context: Pipeline context with assets list
Returns:
Updated context with asset_index
"""
# Create isolated task directory
task_dir, task_id = create_task_output_dir()
context.task_id = task_id
context.task_dir = Path(task_dir) # Convert to Path for easier usage
# Determine final video path
context.final_video_path = get_task_final_video_path(task_id)
logger.info(f"📁 Task directory created: {task_dir}")
logger.info("🔍 Analyzing uploaded assets...")
assets: List[str] = context.request.get("assets", [])
if not assets:
raise ValueError("No assets provided. Please upload at least one image or video.")
logger.info(f"Found {len(assets)} assets to analyze")
self.asset_index = {}
for i, asset_path in enumerate(assets, 1):
asset_path_obj = Path(asset_path)
if not asset_path_obj.exists():
logger.warning(f"Asset not found: {asset_path}")
continue
logger.info(f"Analyzing asset {i}/{len(assets)}: {asset_path_obj.name}")
# Determine asset type
asset_type = self._get_asset_type(asset_path_obj)
if asset_type == "image":
# Analyze image using ImageAnalysisService
analysis_source = context.request.get("source", "runninghub")
description = await self.core.image_analysis(asset_path, source=analysis_source)
self.asset_index[asset_path] = {
"path": asset_path,
"type": "image",
"name": asset_path_obj.name,
"description": description
}
logger.info(f"✅ Image analyzed: {description[:50]}...")
elif asset_type == "video":
# TODO: Extract keyframes and analyze
# For MVP, we'll skip video analysis and just record metadata
self.asset_index[asset_path] = {
"path": asset_path,
"type": "video",
"name": asset_path_obj.name,
"description": "Video asset"
}
logger.info(f"⏭️ Video registered (analysis not yet implemented)")
else:
logger.warning(f"Unknown asset type: {asset_path}")
logger.success(f"✅ Asset analysis complete: {len(self.asset_index)} assets indexed")
# Store asset index in context
context.asset_index = self.asset_index
return context
async def determine_title(self, context: PipelineContext) -> PipelineContext:
"""
Use user-provided title or generate one via LLM
Args:
context: Pipeline context
Returns:
Updated context with title
"""
from pixelle_video.utils.content_generators import generate_title
title = context.request.get("video_title")
if title:
context.title = title
logger.info(f"📝 Video title: {title} (user-specified)")
else:
# Generate title from intent using LLM
intent = context.request.get("intent", context.input_text)
context.title = await generate_title(
self.core.llm,
content=intent,
strategy="llm"
)
logger.info(f"📝 Video title: {context.title} (LLM-generated)")
return context
async def generate_content(self, context: PipelineContext) -> PipelineContext:
"""
Generate video script using LLM with structured output
LLM directly assigns assets to scenes - no complex matching logic needed.
Args:
context: Pipeline context
Returns:
Updated context with generated script (scenes already have asset_path assigned)
"""
logger.info("🤖 Generating video script with LLM...")
# Build prompt for LLM
intent = context.request.get("intent", context.title)
style = context.request.get("style", "professional and engaging")
duration = context.request.get("duration", 30)
# Prepare asset descriptions with full paths for LLM to reference
asset_info = []
for asset_path, metadata in self.asset_index.items():
asset_info.append(f"- Path: {asset_path}\n Description: {metadata['description']}")
assets_text = "\n".join(asset_info)
prompt = f"""You are a video script writer. Generate a {duration}-second video script.
## Requirements
- Intent: {intent}
- Style: {style}
- Target Duration: {duration} seconds
## Available Assets (use the exact path in your response)
{assets_text}
## Instructions
1. Decide how many scenes are needed based on the target duration (typically 5-15 seconds per scene)
2. For each scene, directly assign ONE asset from the available assets above
3. Each scene can have 1-5 narration sentences
4. Try to use all available assets, but it's OK to reuse if needed
5. Total duration of all scenes should be approximately {duration} seconds
## Output Requirements
For each scene, provide:
- scene_number: Sequential number starting from 1
- asset_path: The EXACT path from the available assets list
- narrations: Array of 1-5 narration sentences
- duration: Estimated duration in seconds
Generate the video script now:"""
# Call LLM with structured output
script: VideoScript = await self.core.llm(
prompt=prompt,
response_type=VideoScript,
temperature=0.8,
max_tokens=4000
)
# Convert to dict format for compatibility with downstream code
context.script = [scene.model_dump() for scene in script.scenes]
# Validate asset paths exist
for scene in context.script:
asset_path = scene.get("asset_path")
if asset_path not in self.asset_index:
# Find closest match (in case LLM slightly modified the path)
matched = False
for known_path in self.asset_index.keys():
if Path(known_path).name == Path(asset_path).name:
scene["asset_path"] = known_path
matched = True
logger.warning(f"Corrected asset path: {asset_path} -> {known_path}")
break
if not matched:
# Fallback to first available asset
fallback_path = list(self.asset_index.keys())[0]
logger.warning(f"Unknown asset path '{asset_path}', using fallback: {fallback_path}")
scene["asset_path"] = fallback_path
logger.success(f"✅ Generated script with {len(context.script)} scenes")
# Log script preview
for scene in context.script:
narrations = scene.get("narrations", [])
if isinstance(narrations, str):
narrations = [narrations]
narration_preview = " | ".join([n[:30] + "..." if len(n) > 30 else n for n in narrations[:2]])
asset_name = Path(scene.get("asset_path", "unknown")).name
logger.info(f"Scene {scene['scene_number']} [{asset_name}]: {narration_preview}")
return context
async def plan_visuals(self, context: PipelineContext) -> PipelineContext:
"""
Prepare matched scenes from LLM-generated script
Since LLM already assigned asset_path in generate_content, this method
simply converts the script format to matched_scenes format.
Args:
context: Pipeline context
Returns:
Updated context with matched_scenes
"""
logger.info("🎯 Preparing scene-asset mapping...")
# LLM already assigned asset_path to each scene in generate_content
# Just convert to matched_scenes format for downstream compatibility
context.matched_scenes = [
{
**scene,
"matched_asset": scene["asset_path"] # Alias for compatibility
}
for scene in context.script
]
# Log asset usage summary
asset_usage = {}
for scene in context.matched_scenes:
asset = scene["matched_asset"]
asset_usage[asset] = asset_usage.get(asset, 0) + 1
logger.info(f"📊 Asset usage summary:")
for asset_path, count in asset_usage.items():
logger.info(f" {Path(asset_path).name}: {count} scene(s)")
return context
async def initialize_storyboard(self, context: PipelineContext) -> PipelineContext:
"""
Initialize storyboard from matched scenes
Args:
context: Pipeline context
Returns:
Updated context with storyboard
"""
from pixelle_video.models.storyboard import (
Storyboard,
StoryboardFrame,
StoryboardConfig
)
from datetime import datetime
# Extract all narrations in order for compatibility
all_narrations = []
for scene in context.matched_scenes:
narrations = scene.get("narrations", [scene.get("narration", "")])
if isinstance(narrations, str):
narrations = [narrations]
all_narrations.extend(narrations)
context.narrations = all_narrations
# Get template dimensions
template_name = context.params.get("frame_template", "1080x1920/image_default.html")
# Extract dimensions from template name (e.g., "1080x1920")
try:
dims = template_name.split("/")[0].split("x")
media_width = int(dims[0])
media_height = int(dims[1])
except Exception:
# Default to 1080x1920
media_width = 1080
media_height = 1920
# Create StoryboardConfig
context.config = StoryboardConfig(
task_id=context.task_id,
n_storyboard=len(context.matched_scenes), # Number of scenes
min_narration_words=5,
max_narration_words=50,
video_fps=30,
tts_inference_mode="local",
voice_id=context.params.get("voice_id", "zh-CN-YunjianNeural"),
tts_speed=context.params.get("tts_speed", 1.2),
media_width=media_width,
media_height=media_height,
frame_template=template_name,
template_params=context.params.get("template_params")
)
# Create Storyboard
context.storyboard = Storyboard(
title=context.title,
config=context.config,
created_at=datetime.now()
)
# Create StoryboardFrames - one per scene
for i, scene in enumerate(context.matched_scenes):
# Get first narration for the frame (we'll combine audios later)
narrations = scene.get("narrations", [scene.get("narration", "")])
if isinstance(narrations, str):
narrations = [narrations]
# Use first narration as the main text (for subtitle)
# We'll combine all narrations in the audio
main_narration = " ".join(narrations) # Combine for subtitle display
frame = StoryboardFrame(
index=i,
narration=main_narration,
image_prompt=None, # We're using user assets, not generating images
created_at=datetime.now()
)
# Store matched asset path in the frame
frame.image_path = scene["matched_asset"]
frame.media_type = "image"
# Store scene info for later audio generation
frame._scene_data = scene # Temporary storage for multi-narration
context.storyboard.frames.append(frame)
logger.info(f"✅ Created storyboard with {len(context.storyboard.frames)} scenes")
return context
async def produce_assets(self, context: PipelineContext) -> PipelineContext:
"""
Generate scene videos using FrameProcessor (asset + multiple narrations + template)
Args:
context: Pipeline context
Returns:
Updated context with processed frames
"""
logger.info("🎬 Producing scene videos...")
storyboard = context.storyboard
config = context.config
for i, frame in enumerate(storyboard.frames, 1):
logger.info(f"Producing scene {i}/{len(storyboard.frames)}...")
# Get scene data with narrations
scene = frame._scene_data
narrations = scene.get("narrations", [scene.get("narration", "")])
if isinstance(narrations, str):
narrations = [narrations]
logger.info(f"Scene {i} has {len(narrations)} narration(s)")
# Step 1: Generate audio for each narration and combine
narration_audios = []
for j, narration_text in enumerate(narrations, 1):
audio_path = Path(context.task_dir) / "frames" / f"{i:02d}_narration_{j}.mp3"
audio_path.parent.mkdir(parents=True, exist_ok=True)
await self.core.tts(
text=narration_text,
output_path=str(audio_path),
voice_id=config.voice_id,
speed=config.tts_speed
)
narration_audios.append(str(audio_path))
logger.debug(f" Narration {j}/{len(narrations)}: {narration_text[:30]}...")
# Concatenate all narration audios for this scene
if len(narration_audios) > 1:
from pixelle_video.utils.os_util import get_task_frame_path
combined_audio_path = Path(context.task_dir) / "frames" / f"{i:02d}_audio.mp3"
# Use FFmpeg to concatenate audio files
import subprocess
# Create a file list for FFmpeg concat
filelist_path = Path(context.task_dir) / "frames" / f"{i:02d}_audiolist.txt"
with open(filelist_path, 'w') as f:
for audio_file in narration_audios:
escaped_path = str(Path(audio_file).absolute()).replace("'", "'\\''")
f.write(f"file '{escaped_path}'\n")
# Concatenate audio files
concat_cmd = [
'ffmpeg',
'-f', 'concat',
'-safe', '0',
'-i', str(filelist_path),
'-c', 'copy',
'-y',
str(combined_audio_path)
]
subprocess.run(concat_cmd, check=True, capture_output=True)
frame.audio_path = str(combined_audio_path)
logger.info(f"✅ Combined {len(narration_audios)} narrations into one audio")
else:
frame.audio_path = narration_audios[0]
# Step 2: Use FrameProcessor to generate composed frame and video
# FrameProcessor will handle:
# - Template rendering (with proper dimensions)
# - Subtitle composition
# - Video segment creation
# - Proper file naming in frames/
# Since we already have the audio and image, we bypass some steps
# by manually calling the composition steps
# Get audio duration for frame duration
import subprocess
duration_cmd = [
'ffprobe',
'-v', 'error',
'-show_entries', 'format=duration',
'-of', 'default=noprint_wrappers=1:nokey=1',
frame.audio_path
]
duration_result = subprocess.run(duration_cmd, capture_output=True, text=True, check=True)
frame.duration = float(duration_result.stdout.strip())
# Use FrameProcessor for proper composition
processed_frame = await self.core.frame_processor(
frame=frame,
storyboard=storyboard,
config=config,
total_frames=len(storyboard.frames)
)
logger.success(f"✅ Scene {i} complete")
return context
async def post_production(self, context: PipelineContext) -> PipelineContext:
"""
Concatenate scene videos and add BGM
Args:
context: Pipeline context
Returns:
Updated context with final video path
"""
logger.info("🎞️ Concatenating scenes...")
# Collect video segments from storyboard frames
scene_videos = [frame.video_segment_path for frame in context.storyboard.frames]
final_video_path = Path(context.task_dir) / f"{context.title}.mp4"
# Get BGM parameters
bgm_path = context.request.get("bgm_path")
bgm_volume = context.request.get("bgm_volume", 0.2)
bgm_mode = context.request.get("bgm_mode", "loop")
if bgm_path:
logger.info(f"🎵 Adding BGM: {bgm_path} (volume={bgm_volume}, mode={bgm_mode})")
self.core.video.concat_videos(
videos=scene_videos,
output=str(final_video_path),
bgm_path=bgm_path,
bgm_volume=bgm_volume,
bgm_mode=bgm_mode
)
context.final_video_path = str(final_video_path)
context.storyboard.final_video_path = str(final_video_path)
logger.success(f"✅ Final video: {final_video_path}")
return context
async def finalize(self, context: PipelineContext) -> PipelineContext:
"""
Finalize and return result
Args:
context: Pipeline context
Returns:
Final context
"""
logger.success(f"🎉 Asset-based video generation complete!")
logger.info(f"Video: {context.final_video_path}")
return context
# Helper methods
def _get_asset_type(self, path: Path) -> str:
"""Determine asset type from file extension"""
image_exts = {".jpg", ".jpeg", ".png", ".gif", ".webp"}
video_exts = {".mp4", ".mov", ".avi", ".mkv", ".webm"}
ext = path.suffix.lower()
if ext in image_exts:
return "image"
elif ext in video_exts:
return "video"
else:
return "unknown"

View File

@@ -27,12 +27,14 @@ from pixelle_video.config import config_manager
from pixelle_video.services.llm_service import LLMService
from pixelle_video.services.tts_service import TTSService
from pixelle_video.services.media import MediaService
from pixelle_video.services.image_analysis import ImageAnalysisService
from pixelle_video.services.video import VideoService
from pixelle_video.services.frame_processor import FrameProcessor
from pixelle_video.services.persistence import PersistenceService
from pixelle_video.services.history_manager import HistoryManager
from pixelle_video.pipelines.standard import StandardPipeline
from pixelle_video.pipelines.custom import CustomPipeline
from pixelle_video.pipelines.asset_based import AssetBasedPipeline
class PixelleVideoCore:
@@ -184,9 +186,12 @@ class PixelleVideoCore:
logger.info("🚀 Initializing Pixelle-Video...")
# 1. Initialize core services (ComfyKit will be lazy-loaded later)
# Initialize services
self.llm = LLMService(self.config)
self.tts = TTSService(self.config, self)
self.media = MediaService(self.config, self)
self.tts = TTSService(self.config, core=self)
self.media = MediaService(self.config, core=self)
self.image = self.media # Alias for backward compatibility
self.image_analysis = ImageAnalysisService(self.config, core=self)
self.video = VideoService()
self.frame_processor = FrameProcessor(self)
self.persistence = PersistenceService(output_dir="output")
@@ -196,6 +201,7 @@ class PixelleVideoCore:
self.pipelines = {
"standard": StandardPipeline(self),
"custom": CustomPipeline(self),
"asset_based": AssetBasedPipeline(self),
}
logger.info(f"📹 Registered pipelines: {', '.join(self.pipelines.keys())}")

View File

@@ -73,23 +73,29 @@ class FrameProcessor:
frame_num = frame.index + 1
# Determine if this frame needs image generation
needs_image = frame.image_prompt is not None
# If image_path is already set (e.g. asset-based pipeline), we consider it "needs image" but skip generation
has_existing_image = frame.image_path is not None
needs_generation = frame.image_prompt is not None
try:
# Step 1: Generate audio (TTS)
if progress_callback:
progress_callback(ProgressEvent(
event_type="frame_step",
progress=0.0,
frame_current=frame_num,
frame_total=total_frames,
step=1,
action="audio"
))
await self._step_generate_audio(frame, config)
if not frame.audio_path:
if progress_callback:
progress_callback(ProgressEvent(
event_type="frame_step",
progress=0.0,
frame_current=frame_num,
frame_total=total_frames,
step=1,
action="audio"
))
await self._step_generate_audio(frame, config)
else:
logger.debug(f" 1/4: Using existing audio: {frame.audio_path}")
# Step 2: Generate media (image or video, conditional)
if needs_image:
# Step 2: Generate media (image or video, conditional)
if needs_generation:
if progress_callback:
progress_callback(ProgressEvent(
event_type="frame_step",
@@ -100,16 +106,18 @@ class FrameProcessor:
action="media"
))
await self._step_generate_media(frame, config)
elif has_existing_image:
logger.debug(f" 2/4: Using existing image: {frame.image_path}")
else:
frame.image_path = None
frame.media_type = None
logger.debug(f" 2/4: Skipped media generation (not required by template)")
# Step 3: Compose frame (add subtitle)
if progress_callback:
progress_callback(ProgressEvent(
event_type="frame_step",
progress=0.50 if needs_image else 0.33,
progress=0.50 if (needs_generation or has_existing_image) else 0.33,
frame_current=frame_num,
frame_total=total_frames,
step=3,
@@ -121,17 +129,18 @@ class FrameProcessor:
if progress_callback:
progress_callback(ProgressEvent(
event_type="frame_step",
progress=0.75 if needs_image else 0.67,
progress=0.75 if (needs_generation or has_existing_image) else 0.67,
frame_current=frame_num,
frame_total=total_frames,
step=4,
action="video"
))
await self._step_create_video_segment(frame, config)
logger.info(f"✅ Frame {frame.index} completed")
return frame
except Exception as e:
logger.error(f"❌ Failed to process frame {frame.index}: {e}")
raise
@@ -303,6 +312,9 @@ class FrameProcessor:
# Generate frame using HTML (size is auto-parsed from template path)
generator = HTMLFrameGenerator(template_path)
logger.debug(f"Generating frame with image: '{frame.image_path}' (type: {type(frame.image_path)})")
composed_path = await generator.generate_frame(
title=storyboard.title,
text=frame.narration,

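A rough sketch of what this FrameProcessor change enables: a caller (here, the asset-based pipeline) can hand over a frame whose audio and image already exist, so steps 1 and 2 become pass-throughs. Field names follow the frame handling in this diff; the StoryboardFrame constructor arguments are taken from the asset_based pipeline above, and the paths are placeholders.

from datetime import datetime
from pixelle_video.models.storyboard import StoryboardFrame

frame = StoryboardFrame(
    index=0,
    narration="Year-end sale starts today!",
    image_prompt=None,  # no prompt -> needs_generation is False
    created_at=datetime.now(),
)
frame.image_path = "/path/img1.jpg"      # existing image -> media generation skipped
frame.audio_path = "/path/01_audio.mp3"  # existing audio -> TTS step skipped
frame.media_type = "image"

# processed = await core.frame_processor(
#     frame=frame, storyboard=storyboard, config=config, total_frames=1,
# )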
View File

@@ -0,0 +1,197 @@
# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Image Analysis Service - ComfyUI Workflow-based implementation
Uses Florence-2 or other vision models to analyze images and generate descriptions.
"""
from typing import Optional, Literal
from pathlib import Path
from comfykit import ComfyKit
from loguru import logger
from pixelle_video.services.comfy_base_service import ComfyBaseService
class ImageAnalysisService(ComfyBaseService):
"""
Image analysis service - Workflow-based
Uses ComfyKit to execute image analysis workflows (e.g., Florence-2, BLIP, etc.).
Returns detailed textual descriptions of images.
Convention: workflows follow {source}/analyse_image.json pattern
- runninghub/analyse_image.json (default, cloud-based)
- selfhost/analyse_image.json (local ComfyUI)
Usage:
# Use default (runninghub cloud)
description = await pixelle_video.image_analysis("path/to/image.jpg")
# Use local ComfyUI
description = await pixelle_video.image_analysis(
"path/to/image.jpg",
source="selfhost"
)
# List available workflows
workflows = pixelle_video.image_analysis.list_workflows()
"""
WORKFLOW_PREFIX = "analyse_"
WORKFLOWS_DIR = "workflows"
def __init__(self, config: dict, core=None):
"""
Initialize image analysis service
Args:
config: Full application config dict
core: PixelleVideoCore instance (for accessing shared ComfyKit)
"""
super().__init__(config, service_name="image_analysis", core=core)
async def __call__(
self,
image_path: str,
# Workflow source selection
source: Literal['runninghub', 'selfhost'] = 'runninghub',
workflow: Optional[str] = None,
# ComfyUI connection (optional overrides)
comfyui_url: Optional[str] = None,
runninghub_api_key: Optional[str] = None,
# Additional workflow parameters
**params
) -> str:
"""
Analyze an image using workflow
Args:
image_path: Path to the image file (local or URL)
source: Workflow source - 'runninghub' (cloud, default) or 'selfhost' (local ComfyUI)
workflow: Workflow filename (optional, overrides source-based resolution)
comfyui_url: ComfyUI URL (optional, overrides config)
runninghub_api_key: RunningHub API key (optional, overrides config)
**params: Additional workflow parameters
Returns:
str: Text description of the image
Examples:
# Simplest: use default (runninghub cloud)
description = await pixelle_video.image_analysis("temp/06.JPG")
# Use local ComfyUI
description = await pixelle_video.image_analysis(
"temp/06.JPG",
source="selfhost"
)
# Use specific workflow (bypass source-based resolution)
description = await pixelle_video.image_analysis(
"temp/06.JPG",
workflow="selfhost/custom_analysis.json"
)
"""
from pixelle_video.utils.workflow_util import resolve_workflow_path
# 1. Validate image path
image_path_obj = Path(image_path)
if not image_path_obj.exists():
raise FileNotFoundError(f"Image file not found: {image_path}")
# 2. Resolve workflow path using convention
if workflow is None:
# Use standardized naming: {source}/analyse_image.json
workflow = resolve_workflow_path("analyse_image", source)
logger.info(f"Using {source} workflow: {workflow}")
# 3. Resolve workflow (returns structured info)
workflow_info = self._resolve_workflow(workflow=workflow)
# 4. Build workflow parameters
workflow_params = {
"image": str(image_path) # Pass image path to workflow
}
# Add any additional parameters
workflow_params.update(params)
logger.debug(f"Workflow parameters: {workflow_params}")
# 5. Execute workflow using shared ComfyKit instance from core
try:
# Get shared ComfyKit instance (lazy initialization + config hot-reload)
kit = await self.core._get_or_create_comfykit()
# Determine what to pass to ComfyKit based on source
if workflow_info["source"] == "runninghub" and "workflow_id" in workflow_info:
# RunningHub: pass workflow_id
workflow_input = workflow_info["workflow_id"]
logger.info(f"Executing RunningHub workflow: {workflow_input}")
else:
# Selfhost: pass file path
workflow_input = workflow_info["path"]
logger.info(f"Executing selfhost workflow: {workflow_input}")
result = await kit.execute(workflow_input, workflow_params)
# 6. Extract description from result
if result.status != "completed":
error_msg = result.msg or "Unknown error"
logger.error(f"Image analysis failed: {error_msg}")
raise Exception(f"Image analysis failed: {error_msg}")
# Extract text description from result (format varies by source)
description = None
# Try format 1: Selfhost outputs (direct text in outputs)
# Format: {'6': {'text': ['description text']}}
if result.outputs:
for node_id, node_output in result.outputs.items():
if 'text' in node_output:
text_list = node_output['text']
if text_list and len(text_list) > 0:
description = text_list[0]
break
# Try format 2: RunningHub raw_data (text file URL)
# Format: {'raw_data': [{'fileUrl': 'https://...txt', 'fileType': 'txt', ...}]}
if not description and result.outputs and 'raw_data' in result.outputs:
raw_data = result.outputs['raw_data']
if raw_data and len(raw_data) > 0:
# Find text file entry
for item in raw_data:
if item.get('fileType') == 'txt' and 'fileUrl' in item:
# Download text content from URL
import aiohttp
async with aiohttp.ClientSession() as session:
async with session.get(item['fileUrl']) as resp:
if resp.status == 200:
description = await resp.text()
description = description.strip()
break
if not description:
logger.error(f"No text found in outputs: {result.outputs}")
raise Exception("No description generated")
logger.info(f"✅ Image analyzed: {description[:100]}...")
return description
except Exception as e:
logger.error(f"Image analysis error: {e}")
raise
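
Two illustrative shapes of result.outputs that the extraction loop above is written to handle; the values are invented, but the structure mirrors the format comments in the code.

# Selfhost ComfyUI: a node output carrying the caption text directly.
selfhost_outputs = {
    "6": {"text": ["A white cat wearing a blue knitted sweater ..."]},
}

# RunningHub: raw_data entries pointing at a .txt file that gets downloaded.
runninghub_outputs = {
    "raw_data": [
        {"fileUrl": "https://example.com/result.txt", "fileType": "txt"},
    ],
}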

View File

@@ -12,15 +12,22 @@
"""
LLM (Large Language Model) Service - Direct OpenAI SDK implementation
Supports structured output via response_type parameter (Pydantic model).
"""
import os
from typing import Optional
import json
import re
from typing import Optional, Type, TypeVar, Union
from openai import AsyncOpenAI
from pydantic import BaseModel
from loguru import logger
T = TypeVar("T", bound=BaseModel)
class LLMService:
"""
LLM (Large Language Model) service
@@ -114,8 +121,9 @@ class LLMService:
model: Optional[str] = None,
temperature: float = 0.7,
max_tokens: int = 2000,
response_type: Optional[Type[T]] = None,
**kwargs
) -> str:
) -> Union[str, T]:
"""
Generate text using LLM
@@ -126,24 +134,28 @@ class LLMService:
model: Model name (optional, uses config if not provided)
temperature: Sampling temperature (0.0-2.0). Lower is more deterministic.
max_tokens: Maximum tokens to generate
response_type: Optional Pydantic model class for structured output.
If provided, returns parsed model instance instead of string.
**kwargs: Additional provider-specific parameters
Returns:
Generated text
Generated text (str) or parsed Pydantic model instance (if response_type provided)
Examples:
# Use config from config.yaml
# Basic text generation
answer = await pixelle_video.llm("Explain atomic habits")
# Override with custom parameters
answer = await pixelle_video.llm(
prompt="Explain atomic habits in 3 sentences",
api_key="sk-custom-key",
base_url="https://api.custom.com/v1",
model="custom-model",
temperature=0.7,
max_tokens=500
# Structured output with Pydantic model
class MovieReview(BaseModel):
title: str
rating: int
summary: str
review = await pixelle_video.llm(
prompt="Review the movie Inception",
response_type=MovieReview
)
print(review.title) # Structured access
"""
# Create client (new instance each time to support parameter overrides)
client = self._create_client(api_key=api_key, base_url=base_url)
@@ -155,25 +167,143 @@ class LLMService:
or "gpt-3.5-turbo" # Default fallback
)
logger.debug(f"LLM call: model={final_model}, base_url={client.base_url}")
logger.debug(f"LLM call: model={final_model}, base_url={client.base_url}, response_type={response_type}")
try:
response = await client.chat.completions.create(
model=final_model,
if response_type is not None:
# Structured output mode - try beta.chat.completions.parse first
return await self._call_with_structured_output(
client=client,
model=final_model,
prompt=prompt,
response_type=response_type,
temperature=temperature,
max_tokens=max_tokens,
**kwargs
)
else:
# Standard text output mode
response = await client.chat.completions.create(
model=final_model,
messages=[{"role": "user", "content": prompt}],
temperature=temperature,
max_tokens=max_tokens,
**kwargs
)
result = response.choices[0].message.content
logger.debug(f"LLM response length: {len(result)} chars")
return result
except Exception as e:
logger.error(f"LLM call error (model={final_model}, base_url={client.base_url}): {e}")
raise
async def _call_with_structured_output(
self,
client: AsyncOpenAI,
model: str,
prompt: str,
response_type: Type[T],
temperature: float,
max_tokens: int,
**kwargs
) -> T:
"""
Call LLM with structured output support
Tries OpenAI beta.chat.completions.parse first, falls back to JSON parsing
if the provider doesn't support structured outputs.
Args:
client: OpenAI client
model: Model name
prompt: The prompt
response_type: Pydantic model class
temperature: Sampling temperature
max_tokens: Max tokens
**kwargs: Additional parameters
Returns:
Parsed Pydantic model instance
"""
# Try OpenAI structured output API first (beta.chat.completions.parse)
try:
response = await client.beta.chat.completions.parse(
model=model,
messages=[{"role": "user", "content": prompt}],
response_format=response_type,
temperature=temperature,
max_tokens=max_tokens,
**kwargs
)
result = response.choices[0].message.content
logger.debug(f"LLM response length: {len(result)} chars")
parsed = response.choices[0].message.parsed
if parsed is not None:
logger.debug(f"Structured output parsed successfully via beta API")
return parsed
# If parsed is None, fall through to fallback
logger.warning("Structured output API returned None, falling back to JSON parsing")
content = response.choices[0].message.content
return result
except Exception as e:
logger.error(f"LLM call error (model={final_model}, base_url={client.base_url}): {e}")
raise
# If beta API not supported, fall back to JSON mode
logger.debug(f"Structured output API not available ({e}), falling back to JSON parsing")
response = await client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": prompt}],
temperature=temperature,
max_tokens=max_tokens,
**kwargs
)
content = response.choices[0].message.content
# Fallback: Parse JSON from response content
return self._parse_response_as_model(content, response_type)
def _parse_response_as_model(self, content: str, response_type: Type[T]) -> T:
"""
Parse LLM response content as Pydantic model
Args:
content: Raw LLM response text
response_type: Target Pydantic model class
Returns:
Parsed model instance
"""
# Try direct JSON parsing first
try:
data = json.loads(content)
return response_type.model_validate(data)
except json.JSONDecodeError:
pass
# Try extracting from markdown code block
json_pattern = r'```(?:json)?\s*([\s\S]+?)\s*```'
match = re.search(json_pattern, content, re.DOTALL)
if match:
try:
data = json.loads(match.group(1))
return response_type.model_validate(data)
except json.JSONDecodeError:
pass
# Try to find any JSON object in the text
brace_start = content.find('{')
brace_end = content.rfind('}')
if brace_start != -1 and brace_end > brace_start:
try:
json_str = content[brace_start:brace_end + 1]
data = json.loads(json_str)
return response_type.model_validate(data)
except json.JSONDecodeError:
pass
raise ValueError(f"Failed to parse LLM response as {response_type.__name__}: {content[:200]}...")
@property
def active(self) -> str:

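A small sketch of the response shapes the JSON fallback (_parse_response_as_model) is designed to accept, reusing the MovieReview model from the docstring example above; the strings are invented.

from pydantic import BaseModel

class MovieReview(BaseModel):
    title: str
    rating: int
    summary: str

# 1) Bare JSON
raw1 = '{"title": "Inception", "rating": 9, "summary": "Layered heist."}'
# 2) JSON wrapped in a markdown code fence
raw2 = '```json\n{"title": "Inception", "rating": 9, "summary": "Layered heist."}\n```'
# 3) JSON embedded in surrounding prose (first "{" to last "}")
raw3 = 'Here is the review: {"title": "Inception", "rating": 9, "summary": "Layered heist."} Hope it helps.'

# Each of these would parse via llm_service._parse_response_as_model(raw, MovieReview).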
View File

@@ -0,0 +1,67 @@
# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Workflow Path Resolver
Standardized workflow path resolution for all ComfyUI services.
Convention: {source}/{service}.json
Examples:
- Image analysis: selfhost/analyse_image.json, runninghub/analyse_image.json
- Image generation: selfhost/image.json, runninghub/image.json
- Video generation: selfhost/video.json, runninghub/video.json
- TTS: selfhost/tts.json, runninghub/tts.json
"""
from typing import Literal
WorkflowSource = Literal['runninghub', 'selfhost']
def resolve_workflow_path(
service_name: str,
source: WorkflowSource = 'runninghub'
) -> str:
"""
Resolve workflow path using standardized naming convention
Convention: workflows/{source}/{service_name}.json
Args:
service_name: Service identifier (e.g., "analyse_image", "image", "video", "tts")
source: Workflow source - 'runninghub' (default) or 'selfhost'
Returns:
Workflow path in format: "{source}/{service_name}.json"
Examples:
>>> resolve_workflow_path("analyse_image", "runninghub")
'runninghub/analyse_image.json'
>>> resolve_workflow_path("analyse_image", "selfhost")
'selfhost/analyse_image.json'
>>> resolve_workflow_path("image") # defaults to runninghub
'runninghub/image.json'
"""
return f"{source}/{service_name}.json"
def get_default_source() -> WorkflowSource:
"""
Get default workflow source
Returns:
'runninghub' - Cloud-first approach, better for beginners
"""
return 'runninghub'

View File

@@ -0,0 +1,4 @@
{
"source": "runninghub",
"workflow_id": "1996069253201739777"
}

View File

@@ -0,0 +1,61 @@
{
"3": {
"inputs": {
"model": "microsoft/Florence-2-large",
"precision": "fp16",
"attention": "sdpa",
"convert_to_safetensors": false
},
"class_type": "DownloadAndLoadFlorence2Model",
"_meta": {
"title": "DownloadAndLoadFlorence2Model"
}
},
"4": {
"inputs": {
"text_input": "",
"task": "more_detailed_caption",
"fill_mask": true,
"keep_model_loaded": false,
"max_new_tokens": 1024,
"num_beams": 3,
"do_sample": true,
"output_mask_select": "",
"seed": 853848678279928,
"image": [
"5",
0
],
"florence2_model": [
"3",
0
]
},
"class_type": "Florence2Run",
"_meta": {
"title": "Florence2Run"
}
},
"5": {
"inputs": {
"image": "06.JPG"
},
"class_type": "LoadImage",
"_meta": {
"title": "$image.image"
}
},
"6": {
"inputs": {
"text": "The image shows a white cat sitting on a black and white striped stool against a white wall. The cat is wearing a blue knitted sweater and is looking directly at the camera with a curious expression. Its ears are perked up and its eyes are wide open, giving it an alert and inquisitive look. The background is plain white, making the cat the focal point of the image.",
"anything": [
"4",
2
]
},
"class_type": "easy showAnything",
"_meta": {
"title": "Show Any"
}
}
}