Develop the WebUI feature for generating videos from video assets

puke
2025-12-04 15:23:13 +08:00
parent 5c52696e6f
commit 007a39c03a
6 changed files with 172 additions and 79 deletions

View File

@@ -472,7 +472,9 @@ Generate the video script now:"""
context.narrations = all_narrations
# Get template dimensions
template_name = "1080x1920/image_pure.html"
# Use asset_default.html template which supports both image and video assets
# (conditionally shows background image or provides transparent overlay)
template_name = "1080x1920/asset_default.html"
# Extract dimensions from template name (e.g., "1080x1920")
try:
dims = template_name.split("/")[0].split("x")
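For reference, the dimension parsing above reduces to a small helper; a minimal sketch in Python (the fallback size is an assumption, not taken from this diff):

    # Sketch of the template-name convention: "WIDTHxHEIGHT/name.html".
    def parse_template_dims(template_name: str) -> tuple[int, int]:
        try:
            width, height = template_name.split("/")[0].split("x")
            return int(width), int(height)
        except ValueError:
            return 1080, 1920  # hypothetical fallback, not from the diff

    assert parse_template_dims("1080x1920/asset_default.html") == (1080, 1920)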
@@ -524,9 +526,20 @@ Generate the video script now:"""
created_at=datetime.now()
)
# Store matched asset path in the frame
frame.image_path = scene["matched_asset"]
frame.media_type = "image"
# Get asset path and determine actual media type from asset_index
asset_path = scene["matched_asset"]
asset_metadata = self.asset_index.get(asset_path, {})
asset_type = asset_metadata.get("type", "image") # Default to image if not found
# Set media type and path based on actual asset type
if asset_type == "video":
frame.media_type = "video"
frame.video_path = asset_path
logger.debug(f"Scene {i}: Using video asset: {Path(asset_path).name}")
else:
frame.media_type = "image"
frame.image_path = asset_path
logger.debug(f"Scene {i}: Using image asset: {Path(asset_path).name}")
# Store scene info for later audio generation
frame._scene_data = scene # Temporary storage for multi-narration
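Taken together, the hunk means each frame now carries exactly one media path, chosen by the asset's indexed type. A minimal sketch of the dispatch, assuming asset_index maps paths to {"type": ...} dicts as the lookup above implies (Frame is a stand-in for the pipeline's frame model):

    from dataclasses import dataclass

    @dataclass
    class Frame:
        media_type: str | None = None
        image_path: str | None = None
        video_path: str | None = None

    def assign_media(frame: Frame, asset_path: str, asset_index: dict) -> None:
        asset_type = asset_index.get(asset_path, {}).get("type", "image")
        if asset_type == "video":
            frame.media_type, frame.video_path = "video", asset_path
        else:  # default to image when the asset is missing from the index
            frame.media_type, frame.image_path = "image", asset_path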

View File

@@ -73,8 +73,8 @@ class FrameProcessor:
frame_num = frame.index + 1
# Determine if this frame needs image generation
# If image_path is already set (e.g. asset-based pipeline), we consider it "needs image" but skip generation
has_existing_image = frame.image_path is not None
# If image_path or video_path is already set (e.g. asset-based pipeline), we consider it "has existing media" but skip generation
has_existing_media = frame.image_path is not None or frame.video_path is not None
needs_generation = frame.image_prompt is not None
try:
@@ -93,7 +93,6 @@ class FrameProcessor:
else:
logger.debug(f" 1/4: Using existing audio: {frame.audio_path}")
# Step 2: Generate media (image or video, conditional)
# Step 2: Generate media (image or video, conditional)
if needs_generation:
if progress_callback:
@@ -106,8 +105,12 @@ class FrameProcessor:
action="media"
))
await self._step_generate_media(frame, config)
elif has_existing_image:
logger.debug(f" 2/4: Using existing image: {frame.image_path}")
elif has_existing_media:
# Log appropriate message based on media type
if frame.video_path:
logger.debug(f" 2/4: Using existing video: {frame.video_path}")
else:
logger.debug(f" 2/4: Using existing image: {frame.image_path}")
else:
frame.image_path = None
frame.media_type = None
@@ -117,7 +120,7 @@ class FrameProcessor:
if progress_callback:
progress_callback(ProgressEvent(
event_type="frame_step",
progress=0.50 if (needs_generation or has_existing_image) else 0.33,
progress=0.50 if (needs_generation or has_existing_media) else 0.33,
frame_current=frame_num,
frame_total=total_frames,
step=3,
@@ -129,7 +132,7 @@ class FrameProcessor:
if progress_callback:
progress_callback(ProgressEvent(
event_type="frame_step",
progress=0.75 if (needs_generation or has_existing_image) else 0.67,
progress=0.75 if (needs_generation or has_existing_media) else 0.67,
frame_current=frame_num,
frame_total=total_frames,
step=4,
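Both fraction changes follow the same rule: a frame with a media step reports progress out of four steps, a frame without one out of three, which is where the 0.50/0.33 and 0.75/0.67 pairs come from. A minimal sketch of one plausible generalization of the hard-coded values, covering the step-3 and step-4 events shown above:

    def step_progress(step: int, has_media_step: bool) -> float:
        # Progress reported as a step begins: completed steps / total steps.
        if has_media_step:
            return (step - 1) / 4   # steps 3, 4 -> 0.50, 0.75
        return (step - 2) / 3       # media step skipped -> 0.33, 0.67

    assert step_progress(3, True) == 0.50
    assert round(step_progress(4, False), 2) == 0.67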
@@ -313,12 +316,14 @@ class FrameProcessor:
# Generate frame using HTML (size is auto-parsed from template path)
generator = HTMLFrameGenerator(template_path)
logger.debug(f"Generating frame with image: '{frame.image_path}' (type: {type(frame.image_path)})")
# Use video_path for video media, image_path for images
media_path = frame.video_path if frame.media_type == "video" else frame.image_path
logger.debug(f"Generating frame with media: '{media_path}' (type: {frame.media_type})")
composed_path = await generator.generate_frame(
title=storyboard.title,
text=frame.narration,
image=frame.image_path,
image=media_path, # HTMLFrameGenerator handles both image and video paths
ext=ext,
output_path=output_path
)
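With both paths funneled through one variable, the composer stays media-agnostic; the temp_video_with_overlay cleanup in the next hunk suggests that for video frames the composed output is later applied to the video segment as an overlay. The selection itself is just:

    def select_media_path(frame) -> str | None:
        # Video frames carry video_path; image frames (and legacy frames
        # whose media_type is None) fall back to image_path.
        return frame.video_path if frame.media_type == "video" else frame.image_path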
@@ -372,7 +377,8 @@ class FrameProcessor:
os.unlink(temp_video_with_overlay)
elif frame.media_type == "image" or frame.media_type is None:
# Image workflow: create video from image + audio
# Image workflow: Use composed image directly
# The asset_default.html template includes the image in the composition
logger.debug(f" → Using image-based composition")
segment_path = video_service.create_video_from_image(

View File

@@ -32,9 +32,9 @@ class VideoAnalysisService(ComfyBaseService):
Uses ComfyKit to execute video understanding workflows.
Returns detailed textual descriptions of video content.
Convention: workflows follow {source}/video_understanding.json pattern
- runninghub/video_understanding.json (default, cloud-based)
- selfhost/video_understanding.json (local ComfyUI, future)
Convention: workflows follow {source}/analyse_video.json pattern
- runninghub/analyse_video.json (default, cloud-based)
- selfhost/analyse_video.json (local ComfyUI, future)
Usage:
# Use default (runninghub cloud)
@@ -50,7 +50,7 @@ class VideoAnalysisService(ComfyBaseService):
workflows = pixelle_video.video_analysis.list_workflows()
"""
WORKFLOW_PREFIX = "video_understanding"
WORKFLOW_PREFIX = "analyse_video"
WORKFLOWS_DIR = "workflows"
def __init__(self, config: dict, core=None):
@@ -114,8 +114,8 @@ class VideoAnalysisService(ComfyBaseService):
# 2. Resolve workflow path using convention
if workflow is None:
# Use standardized naming: {source}/video_understanding.json
workflow = resolve_workflow_path("video_understanding", source)
# Use standardized naming: {source}/analyse_video.json
workflow = resolve_workflow_path("analyse_video", source)
logger.info(f"Using {source} workflow: {workflow}")
# 3. Resolve workflow (returns structured info)
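The rename is purely conventional; resolve_workflow_path just joins the pieces. A minimal sketch, assuming the workflows/ root implied by WORKFLOWS_DIR above:

    from pathlib import Path

    def resolve_workflow_path(name: str, source: str = "runninghub") -> Path:
        # {WORKFLOWS_DIR}/{source}/{name}.json, per the docstring convention.
        return Path("workflows") / source / f"{name}.json"

    assert resolve_workflow_path("analyse_video") == Path("workflows/runninghub/analyse_video.json")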

View File

@@ -1,5 +1,6 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<meta name="template:media-width" content="1024">
@@ -9,17 +10,16 @@
margin: 0;
padding: 0;
}
body {
margin: 0;
padding: 0;
width: 1080px;
height: 1920px;
font-family: 'PingFang SC', 'Source Han Sans', 'Microsoft YaHei', sans-serif;
background: #000;
overflow: hidden;
}
.page-container {
width: 1080px;
height: 1920px;
@@ -27,7 +27,10 @@
overflow: hidden;
}
/* 1. Background Image Layer (backing image) */
/* 1. Background Media Layer
- For image assets: displays the image
- For video assets: hidden (the video is composited in a later step)
*/
.background-layer {
position: absolute;
top: 0;
@@ -44,10 +47,15 @@
display: block;
}
/* Hide background layer when no image (video mode) */
.background-layer:empty {
display: none;
}
/* 2. Gradient Overlay
Ensures text readability regardless of image brightness
Ensures text readability regardless of background brightness
Top: Darker for Title
Middle: Transparent for Image visibility
Middle: Transparent for Media visibility
Bottom: Darker for Subtitles
*/
.gradient-overlay {
@@ -57,13 +65,11 @@
width: 100%;
height: 100%;
z-index: 1;
background: linear-gradient(
to bottom,
rgba(0,0,0,0.6) 0%,
rgba(0,0,0,0.1) 25%,
rgba(0,0,0,0.1) 60%,
rgba(0,0,0,0.8) 100%
);
background: linear-gradient(to bottom,
rgba(0, 0, 0, 0.6) 0%,
rgba(0, 0, 0, 0.1) 25%,
rgba(0, 0, 0, 0.1) 60%,
rgba(0, 0, 0, 0.8) 100%);
}
/* 3. Content Layer */
@@ -72,7 +78,8 @@
z-index: 2;
width: 100%;
height: 100%;
padding: 120px 80px 0px 80px; /* Top, Right, Bottom, Left */
padding: 120px 80px 0px 80px;
/* Top, Right, Bottom, Left */
box-sizing: border-box;
display: flex;
flex-direction: column;
@@ -85,7 +92,7 @@
font-size: 80px;
font-weight: 700;
line-height: 1.2;
text-shadow: 0 4px 12px rgba(0,0,0,0.5);
text-shadow: 0 4px 12px rgba(0, 0, 0, 0.5);
margin-bottom: 40px;
text-align: center;
}
@@ -110,16 +117,20 @@
font-weight: 500;
line-height: 1.6;
text-align: center;
text-shadow: 0 2px 8px rgba(0,0,0,0.6);
text-shadow: 0 2px 8px rgba(0, 0, 0, 0.6);
backdrop-filter: blur(4px);
}
</style>
</head>
<body>
<div class="page-container">
<!-- Background Image -->
<div class="background-layer">
<img src="{{image}}" alt="Background">
<!-- Background Media Layer
- For image assets: contains <img> tag
- For video assets: empty (hidden by CSS)
-->
<div class="background-layer" id="bg-layer">
<!-- Image will be inserted here for image assets only -->
</div>
<!-- Shadow Overlay for Text Readability -->
@@ -141,5 +152,23 @@
</div>
</div>
</div>
<script>
// Conditionally add image if provided
(function () {
var imageUrl = "{{image}}";
var bgLayer = document.getElementById('bg-layer');
// Only add img tag if image URL is provided and not empty
if (imageUrl && imageUrl.trim() !== "" && imageUrl !== "None") {
var img = document.createElement('img');
img.src = imageUrl;
img.alt = "Background";
bgLayer.appendChild(img);
}
// Otherwise, bg-layer stays empty and gets hidden by CSS
})();
</script>
</body>
</html>
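End to end, the generator substitutes {{image}} once and the inline script decides whether to attach an <img>. A minimal sketch of the substitution side, assuming plain string replacement (HTMLFrameGenerator itself is not part of this diff):

    def render_template(template_html: str, image: str | None) -> str:
        # An empty value leaves imageUrl falsy in the inline script, so no
        # <img> is appended and .background-layer:empty can hide the layer.
        return template_html.replace("{{image}}", image or "")

One caveat worth verifying: browsers count whitespace and the placeholder comment inside #bg-layer as child nodes, so the :empty rule may not fire as written; the script-appended <img> path is unaffected either way.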

View File

@@ -1,41 +1,4 @@
{
"3": {
"inputs": {
"model": "microsoft/Florence-2-large",
"precision": "fp16",
"attention": "sdpa",
"convert_to_safetensors": false
},
"class_type": "DownloadAndLoadFlorence2Model",
"_meta": {
"title": "DownloadAndLoadFlorence2Model"
}
},
"4": {
"inputs": {
"text_input": "",
"task": "more_detailed_caption",
"fill_mask": true,
"keep_model_loaded": false,
"max_new_tokens": 1024,
"num_beams": 3,
"do_sample": true,
"output_mask_select": "",
"seed": 853848678279928,
"image": [
"5",
0
],
"florence2_model": [
"3",
0
]
},
"class_type": "Florence2Run",
"_meta": {
"title": "Florence2Run"
}
},
"5": {
"inputs": {
"image": "06.JPG"
@@ -47,15 +10,34 @@
},
"6": {
"inputs": {
"text": "The image shows a white cat sitting on a black and white striped stool against a white wall. The cat is wearing a blue knitted sweater and is looking directly at the camera with a curious expression. Its ears are perked up and its eyes are wide open, giving it an alert and inquisitive look. The background is plain white, making the cat the focal point of the image.",
"anything": [
"4",
2
"7",
0
]
},
"class_type": "easy showAnything",
"_meta": {
"title": "Show Any"
}
},
"7": {
"inputs": {
"model_name": "Qwen3-VL-8B-Instruct",
"quantization": "None (FP16)",
"attention_mode": "auto",
"preset_prompt": "🖼️ Detailed Description",
"custom_prompt": "",
"max_tokens": 512,
"keep_model_loaded": true,
"seed": 1,
"image": [
"5",
0
]
},
"class_type": "AILab_QwenVL",
"_meta": {
"title": "QwenVL"
}
}
}
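The simplification replaces the Florence-2 pair (nodes 3 and 4) with a single AILab_QwenVL node (7) fed by the existing image-input node (5). A caller only needs to patch the input filename before submitting the graph; a minimal sketch, with the actual submission client left out (it is not shown in this diff):

    import json

    def prepare_image_workflow(workflow_path: str, image_name: str) -> dict:
        with open(workflow_path, encoding="utf-8") as f:
            graph = json.load(f)
        graph["5"]["inputs"]["image"] = image_name  # image-input node above
        return graph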

View File

@@ -0,0 +1,63 @@
{
"9": {
"inputs": {
"prompt": "详细描述这个视频500字以内"
},
"class_type": "CR Prompt Text",
"_meta": {
"title": "⚙️ CR Prompt Text"
}
},
"14": {
"inputs": {
"video": "01_segment.mp4",
"force_rate": 0,
"custom_width": 0,
"custom_height": 0,
"frame_load_cap": 0,
"skip_first_frames": 0,
"select_every_nth": 1,
"format": "AnimateDiff"
},
"class_type": "VHS_LoadVideo",
"_meta": {
"title": "$video.video"
}
},
"18": {
"inputs": {
"text": "这是一个静态插画风格的宣传图或信息图表。画面中央是一位坐在办公桌前、面带微笑的年轻人,他正专注地使用笔记本电脑工作。他的身后是一扇大窗户,窗外透进温暖柔和的光线,营造出温馨舒适的居家氛围;窗边挂着浅色窗帘,墙上悬挂着一个黑色圆形时钟,指针指向约六点十分——暗示下班后的时间段。\n\n从年轻人头顶上方飘落许多金色硬币每枚都印有美元符号“$”,象征被动收入源源不断流入。桌子两侧各摆放一盆高大的绿植(叶片宽大),增添自然气息与生活感。桌面简洁整洁,仅放一台银灰色笔记本和一个小花瓶作为装饰。\n\n图片顶部用粗体黑字写着标题“如何新增被动收入”。底部则有一句引言式文案“下班后时间其实能创造新收入”强调利用业余时间实现财务自由的可能性。左下角标注了创作者信息@Pixelle.AI并注明其为开源多模态AI创意代理工具右下角显示作品类型“Pixelle-Video”。\n\n整体色调以米黄、灰棕为主配以橙金点缀视觉上既专业又不失亲和力适合用于财经类内容推广或个人理财教育场景。构图平衡对称突出主题的同时传递积极向上的价值观——即通过智慧投资时间和技能在非正式工时也能收获财富回报。",
"anything": [
"19",
0
]
},
"class_type": "easy showAnything",
"_meta": {
"title": "Show Any"
}
},
"19": {
"inputs": {
"model_name": "Qwen3-VL-8B-Instruct",
"quantization": "None (FP16)",
"attention_mode": "auto",
"preset_prompt": "📹 Video Summary",
"custom_prompt": [
"9",
0
],
"max_tokens": 1024,
"keep_model_loaded": true,
"seed": 582488656,
"video": [
"14",
0
]
},
"class_type": "AILab_QwenVL",
"_meta": {
"title": "QwenVL"
}
}
}
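Note the _meta.title of node 14, "$video.video": it reads like a parameter-binding convention in which a $name.field title maps a caller-supplied argument onto that node's input. A minimal sketch of that interpretation (an assumption about the convention, not documented ComfyKit behavior):

    def bind_params(graph: dict, **params) -> dict:
        # Under this reading, "$video.video" binds params["video"] to
        # graph["14"]["inputs"]["video"].
        for node in graph.values():
            title = node.get("_meta", {}).get("title", "")
            if title.startswith("$") and "." in title:
                name, field = title[1:].split(".", 1)
                if name in params:
                    node["inputs"][field] = params[name]
        return graph

Usage under that assumption: bind_params(graph, video="01_segment.mp4") before submission.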