# Copyright (C) 2025 AIDC-AI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # http://www.apache.org/licenses/LICENSE-2.0 # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Asset-Based Video Pipeline Generates marketing videos from user-provided assets (images/videos) rather than AI-generated media. Ideal for small businesses with existing media libraries. Workflow: 1. Analyze uploaded assets (images/videos) 2. Generate script based on user intent and available assets 3. Match assets to script scenes 4. Compose final video with narrations Example: pipeline = AssetBasedPipeline(pixelle_video) result = await pipeline( assets=["/path/img1.jpg", "/path/img2.jpg"], video_title="Pet Store Year-End Sale", intent="Promote our pet store's year-end sale with a warm and friendly tone", duration=30 ) """ from typing import List, Dict, Any, Optional, Callable from pathlib import Path from loguru import logger from pydantic import BaseModel, Field from pixelle_video.pipelines.linear import LinearVideoPipeline, PipelineContext from pixelle_video.models.progress import ProgressEvent from pixelle_video.utils.os_util import ( create_task_output_dir, get_task_final_video_path ) # Type alias for progress callback ProgressCallback = Optional[Callable[[ProgressEvent], None]] # ==================== Structured Output Models ==================== class SceneScript(BaseModel): """Single scene in the video script""" scene_number: int = Field(description="Scene number starting from 1") asset_path: str = Field(description="Path to the asset file for this scene") narrations: List[str] = Field(description="List of narration sentences for this scene (1-5 sentences)") duration: int = Field(description="Estimated duration in seconds for this scene") class VideoScript(BaseModel): """Complete video script with scenes""" scenes: List[SceneScript] = Field(description="List of scenes in the video") class AssetBasedPipeline(LinearVideoPipeline): """ Asset-Based Video Pipeline Generates videos from user-provided assets instead of AI-generated media. """ def __init__(self, core): """ Initialize pipeline Args: core: PixelleVideoCore instance """ super().__init__(core) self.asset_index: Dict[str, Any] = {} # In-memory asset metadata async def __call__( self, assets: List[str], video_title: str = "", intent: Optional[str] = None, duration: int = 30, source: str = "runninghub", bgm_path: Optional[str] = None, bgm_volume: float = 0.2, bgm_mode: str = "loop", progress_callback: ProgressCallback = None, **kwargs ) -> PipelineContext: """ Execute pipeline with user-provided assets Args: assets: List of asset file paths video_title: Video title intent: Video intent/purpose (defaults to video_title) duration: Target duration in seconds source: Workflow source ("runninghub" or "selfhost") bgm_path: Path to background music file (optional) bgm_volume: BGM volume (0.0-1.0, default 0.2) bgm_mode: BGM mode ("loop" or "once", default "loop") progress_callback: Optional callback for progress updates **kwargs: Additional parameters Returns: Pipeline context with generated video """ from pixelle_video.pipelines.linear import PipelineContext # Store progress callback self._progress_callback = progress_callback # Create custom context with asset-specific parameters ctx = PipelineContext( input_text=intent or video_title, # Use intent or title as input_text params={ "assets": assets, "video_title": video_title, "intent": intent or video_title, "duration": duration, "source": source, "bgm_path": bgm_path, "bgm_volume": bgm_volume, "bgm_mode": bgm_mode, **kwargs } ) # Store request parameters in context for easy access ctx.request = ctx.params try: # Execute pipeline lifecycle await self.setup_environment(ctx) await self.determine_title(ctx) await self.generate_content(ctx) await self.plan_visuals(ctx) await self.initialize_storyboard(ctx) await self.produce_assets(ctx) await self.post_production(ctx) await self.finalize(ctx) return ctx except Exception as e: await self.handle_exception(ctx, e) raise def _emit_progress(self, event: ProgressEvent): """Emit progress event to callback if available""" if self._progress_callback: self._progress_callback(event) async def setup_environment(self, context: PipelineContext) -> PipelineContext: """ Analyze uploaded assets and build asset index Args: context: Pipeline context with assets list Returns: Updated context with asset_index """ # Create isolated task directory task_dir, task_id = create_task_output_dir() context.task_id = task_id context.task_dir = Path(task_dir) # Convert to Path for easier usage # Determine final video path context.final_video_path = get_task_final_video_path(task_id) logger.info(f"📁 Task directory created: {task_dir}") logger.info("🔍 Analyzing uploaded assets...") assets: List[str] = context.request.get("assets", []) if not assets: raise ValueError("No assets provided. Please upload at least one image or video.") total_assets = len(assets) logger.info(f"Found {total_assets} assets to analyze") # Emit initial progress (0-15% for asset analysis) self._emit_progress(ProgressEvent( event_type="analyzing_assets", progress=0.01, frame_current=0, frame_total=total_assets, extra_info="start" )) self.asset_index = {} for i, asset_path in enumerate(assets, 1): asset_path_obj = Path(asset_path) if not asset_path_obj.exists(): logger.warning(f"Asset not found: {asset_path}") continue logger.info(f"Analyzing asset {i}/{total_assets}: {asset_path_obj.name}") # Emit progress for this asset progress = 0.01 + (i - 1) / total_assets * 0.14 # 1% - 15% self._emit_progress(ProgressEvent( event_type="analyzing_asset", progress=progress, frame_current=i, frame_total=total_assets, extra_info=asset_path_obj.name )) # Determine asset type asset_type = self._get_asset_type(asset_path_obj) if asset_type == "image": # Analyze image using ImageAnalysisService analysis_source = context.request.get("source", "runninghub") description = await self.core.image_analysis(asset_path, source=analysis_source) self.asset_index[asset_path] = { "path": asset_path, "type": "image", "name": asset_path_obj.name, "description": description } logger.info(f"✅ Image analyzed: {description[:50]}...") elif asset_type == "video": # Analyze video using VideoAnalysisService analysis_source = context.request.get("source", "runninghub") try: description = await self.core.video_analysis(asset_path, source=analysis_source) self.asset_index[asset_path] = { "path": asset_path, "type": "video", "name": asset_path_obj.name, "description": description } logger.info(f"✅ Video analyzed: {description[:50]}...") except Exception as e: logger.warning(f"Video analysis failed for {asset_path_obj.name}: {e}, using fallback") self.asset_index[asset_path] = { "path": asset_path, "type": "video", "name": asset_path_obj.name, "description": "Video asset (analysis failed)" } else: logger.warning(f"Unknown asset type: {asset_path}") logger.success(f"✅ Asset analysis complete: {len(self.asset_index)} assets indexed") # Store asset index in context context.asset_index = self.asset_index # Emit completion of asset analysis self._emit_progress(ProgressEvent( event_type="analyzing_assets", progress=0.15, frame_current=total_assets, frame_total=total_assets, extra_info="complete" )) return context async def determine_title(self, context: PipelineContext) -> PipelineContext: """ Use user-provided title if available, otherwise leave empty Args: context: Pipeline context Returns: Updated context with title (may be empty) """ title = context.request.get("video_title") if title: context.title = title logger.info(f"📝 Video title: {title} (user-specified)") else: context.title = "" logger.info(f"📝 No video title specified (will be hidden in template)") return context async def generate_content(self, context: PipelineContext) -> PipelineContext: """ Generate video script using LLM with structured output LLM directly assigns assets to scenes - no complex matching logic needed. Args: context: Pipeline context Returns: Updated context with generated script (scenes already have asset_path assigned) """ from pixelle_video.prompts.asset_script_generation import build_asset_script_prompt logger.info("🤖 Generating video script with LLM...") # Emit progress for script generation (15% - 25%) self._emit_progress(ProgressEvent( event_type="generating_script", progress=0.16 )) # Build prompt for LLM intent = context.request.get("intent", context.input_text) duration = context.request.get("duration", 30) title = context.title # May be empty if user didn't provide one # Prepare asset descriptions with full paths for LLM to reference asset_info = [] for asset_path, metadata in self.asset_index.items(): asset_info.append(f"- Path: {asset_path}\n Description: {metadata['description']}") assets_text = "\n".join(asset_info) # Build prompt using the centralized prompt function prompt = build_asset_script_prompt( intent=intent, duration=duration, assets_text=assets_text, title=title ) # Call LLM with structured output script: VideoScript = await self.core.llm( prompt=prompt, response_type=VideoScript, temperature=0.8, max_tokens=4000 ) # Convert to dict format for compatibility with downstream code context.script = [scene.model_dump() for scene in script.scenes] # Validate asset paths exist for scene in context.script: asset_path = scene.get("asset_path") if asset_path not in self.asset_index: # Find closest match (in case LLM slightly modified the path) matched = False for known_path in self.asset_index.keys(): if Path(known_path).name == Path(asset_path).name: scene["asset_path"] = known_path matched = True logger.warning(f"Corrected asset path: {asset_path} -> {known_path}") break if not matched: # Fallback to first available asset fallback_path = list(self.asset_index.keys())[0] logger.warning(f"Unknown asset path '{asset_path}', using fallback: {fallback_path}") scene["asset_path"] = fallback_path logger.success(f"✅ Generated script with {len(context.script)} scenes") # Emit progress after script generation self._emit_progress(ProgressEvent( event_type="generating_script", progress=0.25, extra_info="complete" )) # Log script preview for scene in context.script: narrations = scene.get("narrations", []) if isinstance(narrations, str): narrations = [narrations] narration_preview = " | ".join([n[:30] + "..." if len(n) > 30 else n for n in narrations[:2]]) asset_name = Path(scene.get("asset_path", "unknown")).name logger.info(f"Scene {scene['scene_number']} [{asset_name}]: {narration_preview}") return context async def plan_visuals(self, context: PipelineContext) -> PipelineContext: """ Prepare matched scenes from LLM-generated script Since LLM already assigned asset_path in generate_content, this method simply converts the script format to matched_scenes format. Args: context: Pipeline context Returns: Updated context with matched_scenes """ logger.info("🎯 Preparing scene-asset mapping...") # LLM already assigned asset_path to each scene in generate_content # Just convert to matched_scenes format for downstream compatibility context.matched_scenes = [ { **scene, "matched_asset": scene["asset_path"] # Alias for compatibility } for scene in context.script ] # Log asset usage summary asset_usage = {} for scene in context.matched_scenes: asset = scene["matched_asset"] asset_usage[asset] = asset_usage.get(asset, 0) + 1 logger.info(f"📊 Asset usage summary:") for asset_path, count in asset_usage.items(): logger.info(f" {Path(asset_path).name}: {count} scene(s)") return context async def initialize_storyboard(self, context: PipelineContext) -> PipelineContext: """ Initialize storyboard from matched scenes Args: context: Pipeline context Returns: Updated context with storyboard """ from pixelle_video.models.storyboard import ( Storyboard, StoryboardFrame, StoryboardConfig ) from datetime import datetime # Extract all narrations in order for compatibility all_narrations = [] for scene in context.matched_scenes: narrations = scene.get("narrations", [scene.get("narration", "")]) if isinstance(narrations, str): narrations = [narrations] all_narrations.extend(narrations) context.narrations = all_narrations # Get template dimensions # Use asset_default.html template which supports both image and video assets # (conditionally shows background image or provides transparent overlay) template_name = "1080x1920/asset_default.html" # Extract dimensions from template name (e.g., "1080x1920") try: dims = template_name.split("/")[0].split("x") media_width = int(dims[0]) media_height = int(dims[1]) except: # Default to 1080x1920 media_width = 1080 media_height = 1920 # Create StoryboardConfig context.config = StoryboardConfig( task_id=context.task_id, n_storyboard=len(context.matched_scenes), # Number of scenes min_narration_words=5, max_narration_words=50, video_fps=30, tts_inference_mode="local", voice_id=context.params.get("voice_id", "zh-CN-YunjianNeural"), tts_speed=context.params.get("tts_speed", 1.2), media_width=media_width, media_height=media_height, frame_template=template_name, template_params=context.params.get("template_params") ) # Create Storyboard context.storyboard = Storyboard( title=context.title, config=context.config, created_at=datetime.now() ) # Create StoryboardFrames - one per scene for i, scene in enumerate(context.matched_scenes): # Get first narration for the frame (we'll combine audios later) narrations = scene.get("narrations", [scene.get("narration", "")]) if isinstance(narrations, str): narrations = [narrations] # Use first narration as the main text (for subtitle) # We'll combine all narrations in the audio main_narration = " ".join(narrations) # Combine for subtitle display frame = StoryboardFrame( index=i, narration=main_narration, image_prompt=None, # We're using user assets, not generating images created_at=datetime.now() ) # Get asset path and determine actual media type from asset_index asset_path = scene["matched_asset"] asset_metadata = self.asset_index.get(asset_path, {}) asset_type = asset_metadata.get("type", "image") # Default to image if not found # Set media type and path based on actual asset type if asset_type == "video": frame.media_type = "video" frame.video_path = asset_path logger.debug(f"Scene {i}: Using video asset: {Path(asset_path).name}") else: frame.media_type = "image" frame.image_path = asset_path logger.debug(f"Scene {i}: Using image asset: {Path(asset_path).name}") # Store scene info for later audio generation frame._scene_data = scene # Temporary storage for multi-narration context.storyboard.frames.append(frame) logger.info(f"✅ Created storyboard with {len(context.storyboard.frames)} scenes") return context async def produce_assets(self, context: PipelineContext) -> PipelineContext: """ Generate scene videos using FrameProcessor (asset + multiple narrations + template) Args: context: Pipeline context Returns: Updated context with processed frames """ logger.info("🎬 Producing scene videos...") storyboard = context.storyboard config = context.config total_frames = len(storyboard.frames) # Progress range: 30% - 85% for frame production base_progress = 0.30 progress_range = 0.55 # 85% - 30% for i, frame in enumerate(storyboard.frames, 1): logger.info(f"Producing scene {i}/{total_frames}...") # Emit progress for this frame (each frame has 4 steps: audio, combine, duration, compose) frame_progress = base_progress + (i - 1) / total_frames * progress_range self._emit_progress(ProgressEvent( event_type="frame_step", progress=frame_progress, frame_current=i, frame_total=total_frames, step=1, action="audio" )) # Get scene data with narrations scene = frame._scene_data narrations = scene.get("narrations", [scene.get("narration", "")]) if isinstance(narrations, str): narrations = [narrations] logger.info(f"Scene {i} has {len(narrations)} narration(s)") # Step 1: Generate audio for each narration and combine narration_audios = [] for j, narration_text in enumerate(narrations, 1): audio_path = Path(context.task_dir) / "frames" / f"{i:02d}_narration_{j}.mp3" audio_path.parent.mkdir(parents=True, exist_ok=True) await self.core.tts( text=narration_text, output_path=str(audio_path), voice_id=config.voice_id, speed=config.tts_speed ) narration_audios.append(str(audio_path)) logger.debug(f" Narration {j}/{len(narrations)}: {narration_text[:30]}...") # Concatenate all narration audios for this scene if len(narration_audios) > 1: from pixelle_video.utils.os_util import get_task_frame_path # Emit progress for combining audio frame_progress = base_progress + ((i - 1) + 0.25) / total_frames * progress_range self._emit_progress(ProgressEvent( event_type="frame_step", progress=frame_progress, frame_current=i, frame_total=total_frames, step=2, action="audio" )) combined_audio_path = Path(context.task_dir) / "frames" / f"{i:02d}_audio.mp3" # Use FFmpeg to concatenate audio files import subprocess # Create a file list for FFmpeg concat filelist_path = Path(context.task_dir) / "frames" / f"{i:02d}_audiolist.txt" with open(filelist_path, 'w') as f: for audio_file in narration_audios: escaped_path = str(Path(audio_file).absolute()).replace("'", "'\\''") f.write(f"file '{escaped_path}'\n") # Concatenate audio files concat_cmd = [ 'ffmpeg', '-f', 'concat', '-safe', '0', '-i', str(filelist_path), '-c', 'copy', '-y', str(combined_audio_path) ] subprocess.run(concat_cmd, check=True, capture_output=True) frame.audio_path = str(combined_audio_path) logger.info(f"✅ Combined {len(narration_audios)} narrations into one audio") else: frame.audio_path = narration_audios[0] # Step 2: Use FrameProcessor to generate composed frame and video # FrameProcessor will handle: # - Template rendering (with proper dimensions) # - Subtitle composition # - Video segment creation # - Proper file naming in frames/ # Since we already have the audio and image, we bypass some steps # by manually calling the composition steps # Emit progress for duration calculation frame_progress = base_progress + ((i - 1) + 0.5) / total_frames * progress_range self._emit_progress(ProgressEvent( event_type="frame_step", progress=frame_progress, frame_current=i, frame_total=total_frames, step=3, action="compose" )) # Get audio duration for frame duration import subprocess duration_cmd = [ 'ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', frame.audio_path ] duration_result = subprocess.run(duration_cmd, capture_output=True, text=True, check=True) frame.duration = float(duration_result.stdout.strip()) # Emit progress for video composition frame_progress = base_progress + ((i - 1) + 0.75) / total_frames * progress_range self._emit_progress(ProgressEvent( event_type="frame_step", progress=frame_progress, frame_current=i, frame_total=total_frames, step=4, action="video" )) # Use FrameProcessor for proper composition processed_frame = await self.core.frame_processor( frame=frame, storyboard=storyboard, config=config, total_frames=total_frames ) logger.success(f"✅ Scene {i} complete") # Emit completion of frame production self._emit_progress(ProgressEvent( event_type="processing_frame", progress=0.85, frame_current=total_frames, frame_total=total_frames )) return context async def post_production(self, context: PipelineContext) -> PipelineContext: """ Concatenate scene videos and add BGM Args: context: Pipeline context Returns: Updated context with final video path """ logger.info("🎞️ Concatenating scenes...") # Emit progress for concatenation (85% - 95%) self._emit_progress(ProgressEvent( event_type="concatenating", progress=0.86 )) # Collect video segments from storyboard frames scene_videos = [frame.video_segment_path for frame in context.storyboard.frames] # Generate filename: use title if provided, otherwise use task_id or default name if context.title: filename = f"{context.title}.mp4" else: filename = f"{context.task_id}.mp4" # Use task_id as filename when title is empty final_video_path = Path(context.task_dir) / filename # Get BGM parameters bgm_path = context.request.get("bgm_path") bgm_volume = context.request.get("bgm_volume", 0.2) bgm_mode = context.request.get("bgm_mode", "loop") if bgm_path: logger.info(f"🎵 Adding BGM: {bgm_path} (volume={bgm_volume}, mode={bgm_mode})") self.core.video.concat_videos( videos=scene_videos, output=str(final_video_path), bgm_path=bgm_path, bgm_volume=bgm_volume, bgm_mode=bgm_mode ) context.final_video_path = str(final_video_path) context.storyboard.final_video_path = str(final_video_path) logger.success(f"✅ Final video: {final_video_path}") # Emit completion of concatenation self._emit_progress(ProgressEvent( event_type="concatenating", progress=0.95, extra_info="complete" )) return context async def finalize(self, context: PipelineContext) -> PipelineContext: """ Finalize and return result Args: context: Pipeline context Returns: Final context """ logger.success(f"🎉 Asset-based video generation complete!") logger.info(f"Video: {context.final_video_path}") # Emit completion self._emit_progress(ProgressEvent( event_type="completed", progress=1.0 )) # Persist metadata for history tracking await self._persist_task_data(context) return context async def _persist_task_data(self, ctx: PipelineContext): """ Persist task metadata and storyboard to filesystem for history tracking """ from pathlib import Path try: storyboard = ctx.storyboard task_id = ctx.task_id if not task_id: logger.warning("No task_id in context, skipping persistence") return # Get file size video_path_obj = Path(ctx.final_video_path) file_size = video_path_obj.stat().st_size if video_path_obj.exists() else 0 # Build metadata input_params = { "text": ctx.input_text, "mode": "asset_based", "title": ctx.title or "", "n_scenes": len(storyboard.frames) if storyboard else 0, "assets": ctx.request.get("assets", []), "intent": ctx.request.get("intent"), "duration": ctx.request.get("duration"), "source": ctx.request.get("source"), "voice_id": ctx.request.get("voice_id"), "tts_speed": ctx.request.get("tts_speed"), } metadata = { "task_id": task_id, "created_at": storyboard.created_at.isoformat() if storyboard and storyboard.created_at else None, "completed_at": storyboard.completed_at.isoformat() if storyboard and storyboard.completed_at else None, "status": "completed", "input": input_params, "result": { "video_path": ctx.final_video_path, "duration": storyboard.total_duration if storyboard else 0, "file_size": file_size, "n_frames": len(storyboard.frames) if storyboard else 0 }, "config": { "llm_model": self.core.config.get("llm", {}).get("model", "unknown"), "llm_base_url": self.core.config.get("llm", {}).get("base_url", "unknown"), "source": ctx.request.get("source", "runninghub"), } } # Save metadata await self.core.persistence.save_task_metadata(task_id, metadata) logger.info(f"💾 Saved task metadata: {task_id}") # Save storyboard if storyboard: await self.core.persistence.save_storyboard(task_id, storyboard) logger.info(f"💾 Saved storyboard: {task_id}") except Exception as e: logger.error(f"Failed to persist task data: {e}") # Don't raise - persistence failure shouldn't break video generation # Helper methods def _get_asset_type(self, path: Path) -> str: """Determine asset type from file extension""" image_exts = {".jpg", ".jpeg", ".png", ".gif", ".webp"} video_exts = {".mp4", ".mov", ".avi", ".mkv", ".webm"} ext = path.suffix.lower() if ext in image_exts: return "image" elif ext in video_exts: return "video" else: return "unknown"