Develop a WebUI feature for generating videos from image assets

puke
2025-12-04 11:14:14 +08:00
parent ea784e0d06
commit 7425b9d23d
8 changed files with 896 additions and 104 deletions


@@ -27,23 +27,27 @@ Example:
result = await pipeline(
assets=["/path/img1.jpg", "/path/img2.jpg"],
video_title="Pet Store Year-End Sale",
style="warm and friendly",
intent="Promote our pet store's year-end sale with a warm and friendly tone",
duration=30
)
"""
-from typing import List, Dict, Any, Optional
+from typing import List, Dict, Any, Optional, Callable
from pathlib import Path
from loguru import logger
from pydantic import BaseModel, Field
from pixelle_video.pipelines.linear import LinearVideoPipeline, PipelineContext
from pixelle_video.models.progress import ProgressEvent
from pixelle_video.utils.os_util import (
create_task_output_dir,
get_task_final_video_path
)
# Type alias for progress callback
ProgressCallback = Optional[Callable[[ProgressEvent], None]]
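# A minimal sketch of a callback a WebUI layer might register (hypothetical
# names; assumes only the ProgressEvent fields used in this module):
#
#     def on_progress(event: ProgressEvent) -> None:
#         print(f"[{event.event_type}] {event.progress:.0%}")
#
#     result = await pipeline(assets=[...], progress_callback=on_progress)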
# ==================== Structured Output Models ====================
@@ -82,12 +86,12 @@ class AssetBasedPipeline(LinearVideoPipeline):
assets: List[str],
video_title: str = "",
intent: Optional[str] = None,
-style: str = "professional and engaging",
duration: int = 30,
source: str = "runninghub",
bgm_path: Optional[str] = None,
bgm_volume: float = 0.2,
bgm_mode: str = "loop",
+progress_callback: ProgressCallback = None,
**kwargs
) -> PipelineContext:
"""
@@ -97,12 +101,12 @@ class AssetBasedPipeline(LinearVideoPipeline):
assets: List of asset file paths
video_title: Video title
intent: Video intent/purpose (defaults to video_title)
-style: Video style
duration: Target duration in seconds
source: Workflow source ("runninghub" or "selfhost")
bgm_path: Path to background music file (optional)
bgm_volume: BGM volume (0.0-1.0, default 0.2)
bgm_mode: BGM mode ("loop" or "once", default "loop")
+progress_callback: Optional callback for progress updates
**kwargs: Additional parameters
Returns:
@@ -110,6 +114,9 @@ class AssetBasedPipeline(LinearVideoPipeline):
"""
from pixelle_video.pipelines.linear import PipelineContext
# Store progress callback
self._progress_callback = progress_callback
# Create custom context with asset-specific parameters
ctx = PipelineContext(
input_text=intent or video_title, # Use intent or title as input_text
@@ -117,7 +124,6 @@ class AssetBasedPipeline(LinearVideoPipeline):
"assets": assets,
"video_title": video_title,
"intent": intent or video_title,
"style": style,
"duration": duration,
"source": source,
"bgm_path": bgm_path,
@@ -147,6 +153,11 @@ class AssetBasedPipeline(LinearVideoPipeline):
await self.handle_exception(ctx, e)
raise
def _emit_progress(self, event: ProgressEvent):
"""Emit progress event to callback if available"""
if self._progress_callback:
self._progress_callback(event)
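# Note: the callback runs synchronously on the pipeline's execution path, so
# a slow or raising callback will stall or abort generation. Progress maps to
# fixed bands across this file: 0.01-0.15 asset analysis, 0.16-0.25 script
# generation, 0.30-0.85 frame production, 0.86-0.95 concatenation, and 1.0
# on completion.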
async def setup_environment(self, context: PipelineContext) -> PipelineContext:
"""
Analyze uploaded assets and build asset index
@@ -172,7 +183,17 @@ class AssetBasedPipeline(LinearVideoPipeline):
if not assets:
raise ValueError("No assets provided. Please upload at least one image or video.")
logger.info(f"Found {len(assets)} assets to analyze")
total_assets = len(assets)
logger.info(f"Found {total_assets} assets to analyze")
# Emit initial progress (0-15% for asset analysis)
self._emit_progress(ProgressEvent(
event_type="analyzing_assets",
progress=0.01,
frame_current=0,
frame_total=total_assets,
extra_info="start"
))
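# Event naming: the plural "analyzing_assets" marks batch start/completion,
# while the singular "analyzing_asset" below is emitted once per file.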
self.asset_index = {}
@@ -183,7 +204,17 @@ class AssetBasedPipeline(LinearVideoPipeline):
logger.warning(f"Asset not found: {asset_path}")
continue
logger.info(f"Analyzing asset {i}/{len(assets)}: {asset_path_obj.name}")
logger.info(f"Analyzing asset {i}/{total_assets}: {asset_path_obj.name}")
# Emit progress for this asset
progress = 0.01 + (i - 1) / total_assets * 0.14 # 1% - 15%
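# e.g., with total_assets=4 the per-asset values are 0.01, 0.045, 0.08,
# 0.115, before the band closes at 0.15 below.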
self._emit_progress(ProgressEvent(
event_type="analyzing_asset",
progress=progress,
frame_current=i,
frame_total=total_assets,
extra_info=asset_path_obj.name
))
# Determine asset type
asset_type = self._get_asset_type(asset_path_obj)
@@ -222,34 +253,35 @@ class AssetBasedPipeline(LinearVideoPipeline):
# Store asset index in context
context.asset_index = self.asset_index
# Emit completion of asset analysis
self._emit_progress(ProgressEvent(
event_type="analyzing_assets",
progress=0.15,
frame_current=total_assets,
frame_total=total_assets,
extra_info="complete"
))
return context
async def determine_title(self, context: PipelineContext) -> PipelineContext:
"""
-Use user-provided title or generate one via LLM
+Use user-provided title if available, otherwise leave empty
Args:
context: Pipeline context
Returns:
-Updated context with title
+Updated context with title (may be empty)
"""
-from pixelle_video.utils.content_generators import generate_title
title = context.request.get("video_title")
if title:
context.title = title
logger.info(f"📝 Video title: {title} (user-specified)")
else:
-# Generate title from intent using LLM
-intent = context.request.get("intent", context.input_text)
-context.title = await generate_title(
-self.core.llm,
-content=intent,
-strategy="llm"
-)
-logger.info(f"📝 Video title: {context.title} (LLM-generated)")
+context.title = ""
+logger.info(f"📝 No video title specified (will be hidden in template)")
return context
@@ -267,10 +299,16 @@ class AssetBasedPipeline(LinearVideoPipeline):
"""
logger.info("🤖 Generating video script with LLM...")
# Emit progress for script generation (15% - 25%)
self._emit_progress(ProgressEvent(
event_type="generating_script",
progress=0.16
))
# Build prompt for LLM
-intent = context.request.get("intent", context.title)
-style = context.request.get("style", "professional and engaging")
+intent = context.request.get("intent", context.input_text)
duration = context.request.get("duration", 30)
title = context.title # May be empty if user didn't provide one
# Prepare asset descriptions with full paths for LLM to reference
asset_info = []
@@ -279,11 +317,13 @@ class AssetBasedPipeline(LinearVideoPipeline):
assets_text = "\n".join(asset_info)
# Build title section for prompt (only if title is provided)
title_section = f"- Video Title: {title}\n" if title else ""
prompt = f"""You are a video script writer. Generate a {duration}-second video script.
## Requirements
-- Intent: {intent}
-- Style: {style}
+{title_section}- Intent: {intent}
- Target Duration: {duration} seconds
## Available Assets (use the exact path in your response)
@@ -295,6 +335,7 @@ class AssetBasedPipeline(LinearVideoPipeline):
3. Each scene can have 1-5 narration sentences
4. Try to use all available assets, but it's OK to reuse if needed
5. Total duration of all scenes should be approximately {duration} seconds
{f"6. The narrations should align with the video title: {title}" if title else ""}
## Output Requirements
For each scene, provide:
@@ -337,6 +378,13 @@ Generate the video script now:"""
logger.success(f"✅ Generated script with {len(context.script)} scenes")
# Emit progress after script generation
self._emit_progress(ProgressEvent(
event_type="generating_script",
progress=0.25,
extra_info="complete"
))
# Log script preview
for scene in context.script:
narrations = scene.get("narrations", [])
@@ -413,7 +461,7 @@ Generate the video script now:"""
context.narrations = all_narrations
# Get template dimensions
-template_name = context.params.get("frame_template", "1080x1920/image_default.html")
+template_name = "1080x1920/image_pure.html"
# Extract dimensions from template name (e.g., "1080x1920")
try:
dims = template_name.split("/")[0].split("x")
@@ -492,9 +540,25 @@ Generate the video script now:"""
storyboard = context.storyboard
config = context.config
total_frames = len(storyboard.frames)
# Progress range: 30% - 85% for frame production
base_progress = 0.30
progress_range = 0.55 # 85% - 30%
for i, frame in enumerate(storyboard.frames, 1):
logger.info(f"Producing scene {i}/{len(storyboard.frames)}...")
logger.info(f"Producing scene {i}/{total_frames}...")
# Emit progress for this frame (each frame has 4 steps: audio, combine, duration, compose)
frame_progress = base_progress + (i - 1) / total_frames * progress_range
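# e.g., with total_frames=5 the scenes start at 0.30, 0.41, 0.52, 0.63, 0.74.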
self._emit_progress(ProgressEvent(
event_type="frame_step",
progress=frame_progress,
frame_current=i,
frame_total=total_frames,
step=1,
action="audio"
))
# Get scene data with narrations
scene = frame._scene_data
@@ -524,6 +588,17 @@ Generate the video script now:"""
if len(narration_audios) > 1:
from pixelle_video.utils.os_util import get_task_frame_path
# Emit progress for combining audio
frame_progress = base_progress + ((i - 1) + 0.25) / total_frames * progress_range
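# Each scene's slice is subdivided at offsets 0, 0.25, 0.5, 0.75, matching
# the four steps noted above (audio, combine, duration, compose).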
self._emit_progress(ProgressEvent(
event_type="frame_step",
progress=frame_progress,
frame_current=i,
frame_total=total_frames,
step=2,
action="audio"
))
combined_audio_path = Path(context.task_dir) / "frames" / f"{i:02d}_audio.mp3"
# Use FFmpeg to concatenate audio files
@@ -564,6 +639,17 @@ Generate the video script now:"""
# Since we already have the audio and image, we bypass some steps
# by manually calling the composition steps
# Emit progress for duration calculation
frame_progress = base_progress + ((i - 1) + 0.5) / total_frames * progress_range
self._emit_progress(ProgressEvent(
event_type="frame_step",
progress=frame_progress,
frame_current=i,
frame_total=total_frames,
step=3,
action="compose"
))
# Get audio duration for frame duration
import subprocess
duration_cmd = [
@@ -576,16 +662,35 @@ Generate the video script now:"""
duration_result = subprocess.run(duration_cmd, capture_output=True, text=True, check=True)
frame.duration = float(duration_result.stdout.strip())
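# The elided duration_cmd presumably queries ffprobe for the audio length;
# its stdout is parsed as seconds (float), so the narration audio length
# drives the scene's duration.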
# Emit progress for video composition
frame_progress = base_progress + ((i - 1) + 0.75) / total_frames * progress_range
self._emit_progress(ProgressEvent(
event_type="frame_step",
progress=frame_progress,
frame_current=i,
frame_total=total_frames,
step=4,
action="video"
))
# Use FrameProcessor for proper composition
processed_frame = await self.core.frame_processor(
frame=frame,
storyboard=storyboard,
config=config,
-total_frames=len(storyboard.frames)
+total_frames=total_frames
)
logger.success(f"✅ Scene {i} complete")
# Emit completion of frame production
self._emit_progress(ProgressEvent(
event_type="processing_frame",
progress=0.85,
frame_current=total_frames,
frame_total=total_frames
))
return context
async def post_production(self, context: PipelineContext) -> PipelineContext:
@@ -600,6 +705,12 @@ Generate the video script now:"""
"""
logger.info("🎞️ Concatenating scenes...")
# Emit progress for concatenation (85% - 95%)
self._emit_progress(ProgressEvent(
event_type="concatenating",
progress=0.86
))
# Collect video segments from storyboard frames
scene_videos = [frame.video_segment_path for frame in context.storyboard.frames]
@@ -626,6 +737,13 @@ Generate the video script now:"""
logger.success(f"✅ Final video: {final_video_path}")
# Emit completion of concatenation
self._emit_progress(ProgressEvent(
event_type="concatenating",
progress=0.95,
extra_info="complete"
))
return context
async def finalize(self, context: PipelineContext) -> PipelineContext:
@@ -641,8 +759,84 @@ Generate the video script now:"""
logger.success(f"🎉 Asset-based video generation complete!")
logger.info(f"Video: {context.final_video_path}")
# Emit completion
self._emit_progress(ProgressEvent(
event_type="completed",
progress=1.0
))
# Persist metadata for history tracking
await self._persist_task_data(context)
return context
async def _persist_task_data(self, ctx: PipelineContext):
"""
Persist task metadata and storyboard to filesystem for history tracking
"""
from pathlib import Path
try:
storyboard = ctx.storyboard
task_id = ctx.task_id
if not task_id:
logger.warning("No task_id in context, skipping persistence")
return
# Get file size
video_path_obj = Path(ctx.final_video_path)
file_size = video_path_obj.stat().st_size if video_path_obj.exists() else 0
# Build metadata
input_params = {
"text": ctx.input_text,
"mode": "asset_based",
"title": ctx.title or "",
"n_scenes": len(storyboard.frames) if storyboard else 0,
"assets": ctx.request.get("assets", []),
"intent": ctx.request.get("intent"),
"duration": ctx.request.get("duration"),
"source": ctx.request.get("source"),
"voice_id": ctx.request.get("voice_id"),
"tts_speed": ctx.request.get("tts_speed"),
}
metadata = {
"task_id": task_id,
"created_at": storyboard.created_at.isoformat() if storyboard and storyboard.created_at else None,
"completed_at": storyboard.completed_at.isoformat() if storyboard and storyboard.completed_at else None,
"status": "completed",
"input": input_params,
"result": {
"video_path": ctx.final_video_path,
"duration": storyboard.total_duration if storyboard else 0,
"file_size": file_size,
"n_frames": len(storyboard.frames) if storyboard else 0
},
"config": {
"llm_model": self.core.config.get("llm", {}).get("model", "unknown"),
"llm_base_url": self.core.config.get("llm", {}).get("base_url", "unknown"),
"source": ctx.request.get("source", "runninghub"),
}
}
# Save metadata
await self.core.persistence.save_task_metadata(task_id, metadata)
logger.info(f"💾 Saved task metadata: {task_id}")
# Save storyboard
if storyboard:
await self.core.persistence.save_storyboard(task_id, storyboard)
logger.info(f"💾 Saved storyboard: {task_id}")
except Exception as e:
logger.error(f"Failed to persist task data: {e}")
# Don't raise - persistence failure shouldn't break video generation
# Helper methods
def _get_asset_type(self, path: Path) -> str: