支持基于图片素材的视频生成逻辑

2025-12-03 20:11:32 +08:00
parent 6e99612a68
commit ea784e0d06
9 changed files with 1180 additions and 40 deletions
--- a/pixelle_video/services/frame_processor.py
+++ b/pixelle_video/services/frame_processor.py
@@ -73,23 +73,29 @@ class FrameProcessor:
        frame_num = frame.index + 1
        
        # Determine if this frame needs image generation
-        needs_image = frame.image_prompt is not None
+        # If image_path is already set (e.g. asset-based pipeline), we consider it "needs image" but skip generation
+        has_existing_image = frame.image_path is not None
+        needs_generation = frame.image_prompt is not None
        
        try:
            # Step 1: Generate audio (TTS)
-            if progress_callback:
-                progress_callback(ProgressEvent(
-                    event_type="frame_step",
-                    progress=0.0,
-                    frame_current=frame_num,
-                    frame_total=total_frames,
-                    step=1,
-                    action="audio"
-                ))
-            await self._step_generate_audio(frame, config)
+            if not frame.audio_path:
+                if progress_callback:
+                    progress_callback(ProgressEvent(
+                        event_type="frame_step",
+                        progress=0.0,
+                        frame_current=frame_num,
+                        frame_total=total_frames,
+                        step=1,
+                        action="audio"
+                    ))
+                await self._step_generate_audio(frame, config)
+            else:
+                logger.debug(f"  1/4: Using existing audio: {frame.audio_path}")
            
            # Step 2: Generate media (image or video, conditional)
-            if needs_image:
+            # Only frames with an image_prompt trigger generation; a pre-set image_path is reused as-is
+            if needs_generation:
                if progress_callback:
                    progress_callback(ProgressEvent(
                        event_type="frame_step",
@@ -100,16 +106,18 @@ class FrameProcessor:
                        action="media"
                    ))
                await self._step_generate_media(frame, config)
+            elif has_existing_image:
+                logger.debug(f"  2/4: Using existing image: {frame.image_path}")
            else:
                frame.image_path = None
                frame.media_type = None
                logger.debug(f"  2/4: Skipped media generation (not required by template)")
-            
+        
            # Step 3: Compose frame (add subtitle)
            if progress_callback:
                progress_callback(ProgressEvent(
                    event_type="frame_step",
-                    progress=0.50 if needs_image else 0.33,
+                    progress=0.50 if (needs_generation or has_existing_image) else 0.33,
                    frame_current=frame_num,
                    frame_total=total_frames,
                    step=3,
@@ -121,17 +129,18 @@ class FrameProcessor:
            if progress_callback:
                progress_callback(ProgressEvent(
                    event_type="frame_step",
-                    progress=0.75 if needs_image else 0.67,
+                    progress=0.75 if (needs_generation or has_existing_image) else 0.67,
                    frame_current=frame_num,
                    frame_total=total_frames,
                    step=4,
                    action="video"
                ))
+            
            await self._step_create_video_segment(frame, config)
            
            logger.info(f"✅ Frame {frame.index} completed")
            return frame
-            
+
        except Exception as e:
            logger.error(f"❌ Failed to process frame {frame.index}: {e}")
            raise
@@ -303,6 +312,9 @@ class FrameProcessor:
        
        # Generate frame using HTML (size is auto-parsed from template path)
        generator = HTMLFrameGenerator(template_path)
+        
+        logger.debug(f"Generating frame with image: '{frame.image_path}' (type: {type(frame.image_path)})")
+        
        composed_path = await generator.generate_frame(
            title=storyboard.title,
            text=frame.narration,
--- a/pixelle_video/services/image_analysis.py
+++ b/pixelle_video/services/image_analysis.py
@@ -0,0 +1,197 @@
+# Copyright (C) 2025 AIDC-AI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Image Analysis Service - ComfyUI Workflow-based implementation
+
+Uses Florence-2 or other vision models to analyze images and generate descriptions.
+"""
+
+from typing import Optional, Literal
+from pathlib import Path
+
+from comfykit import ComfyKit
+from loguru import logger
+
+from pixelle_video.services.comfy_base_service import ComfyBaseService
+
+
+class ImageAnalysisService(ComfyBaseService):
+    """
+    Image analysis service - Workflow-based
+    
+    Uses ComfyKit to execute image analysis workflows (e.g., Florence-2, BLIP, etc.).
+    Returns detailed textual descriptions of images.
+    
+    Convention: workflows follow {source}/analyse_image.json pattern
+    - runninghub/analyse_image.json (default, cloud-based)
+    - selfhost/analyse_image.json (local ComfyUI)
+    
+    Usage:
+        # Use default (runninghub cloud)
+        description = await pixelle_video.image_analysis("path/to/image.jpg")
+        
+        # Use local ComfyUI
+        description = await pixelle_video.image_analysis(
+            "path/to/image.jpg",
+            source="selfhost"
+        )
+        
+        # List available workflows
+        workflows = pixelle_video.image_analysis.list_workflows()
+    """
+    
+    WORKFLOW_PREFIX = "analyse_"
+    WORKFLOWS_DIR = "workflows"
+    
+    def __init__(self, config: dict, core=None):
+        """
+        Initialize image analysis service
+        
+        Args:
+            config: Full application config dict
+            core: PixelleVideoCore instance (for accessing shared ComfyKit)
+        """
+        super().__init__(config, service_name="image_analysis", core=core)
+    
+    async def __call__(
+        self,
+        image_path: str,
+        # Workflow source selection (ignored when an explicit workflow is given)
+        source: Literal['runninghub', 'selfhost'] = 'runninghub',
+        workflow: Optional[str] = None,
+        # ComfyUI connection overrides (accepted but not read anywhere in this method)
+        comfyui_url: Optional[str] = None,
+        runninghub_api_key: Optional[str] = None,
+        # Additional workflow parameters (merged over the built-in "image" key)
+        **params
+    ) -> str:
+        """
+        Analyze an image using workflow
+        
+        Args:
+            image_path: Path to the image file (local or URL)
+            source: Workflow source - 'runninghub' (cloud, default) or 'selfhost' (local ComfyUI)
+            workflow: Workflow filename (optional, overrides source-based resolution)
+            comfyui_url: ComfyUI URL (optional, overrides config)
+            runninghub_api_key: RunningHub API key (optional, overrides config)
+            **params: Additional workflow parameters
+        
+        Returns:
+            str: Text description of the image
+        
+        Examples:
+            # Simplest: use default (runninghub cloud)
+            description = await pixelle_video.image_analysis("temp/06.JPG")
+            
+            # Use local ComfyUI
+            description = await pixelle_video.image_analysis(
+                "temp/06.JPG",
+                source="selfhost"
+            )
+            
+            # Use specific workflow (bypass source-based resolution)
+            description = await pixelle_video.image_analysis(
+                "temp/06.JPG",
+                workflow="selfhost/custom_analysis.json"
+            )
+        """
+        from pixelle_video.utils.workflow_util import resolve_workflow_path
+        
+        # 1. Validate image path (local files only: a URL string fails this exists() check)
+        image_path_obj = Path(image_path)
+        if not image_path_obj.exists():
+            raise FileNotFoundError(f"Image file not found: {image_path}")
+        
+        # 2a. Pick the conventional workflow name when the caller did not supply one
+        if workflow is None:
+            # Use standardized naming: {source}/analyse_image.json
+            workflow = resolve_workflow_path("analyse_image", source)
+            logger.info(f"Using {source} workflow: {workflow}")
+        
+        # 2b. Resolve workflow into structured info (keys read below: source, workflow_id, path)
+        workflow_info = self._resolve_workflow(workflow=workflow)
+        
+        # 3. Build workflow parameters
+        workflow_params = {
+            "image": str(image_path)  # Forward the caller's original string, not image_path_obj
+        }
+        
+        # Caller-supplied params win on key collisions (including "image")
+        workflow_params.update(params)
+        
+        logger.debug(f"Workflow parameters: {workflow_params}")
+        
+        # 4. Execute workflow using shared ComfyKit instance from core
+        try:
+            # Get shared ComfyKit instance (lazy initialization + config hot-reload)
+            kit = await self.core._get_or_create_comfykit()
+            
+            # Determine what to pass to ComfyKit based on source
+            if workflow_info["source"] == "runninghub" and "workflow_id" in workflow_info:
+                # RunningHub: pass workflow_id
+                workflow_input = workflow_info["workflow_id"]
+                logger.info(f"Executing RunningHub workflow: {workflow_input}")
+            else:
+                # Selfhost (or a RunningHub entry without workflow_id): pass file path
+                workflow_input = workflow_info["path"]
+                logger.info(f"Executing selfhost workflow: {workflow_input}")
+            
+            result = await kit.execute(workflow_input, workflow_params)
+            
+            # 5. Extract description from result
+            if result.status != "completed":
+                error_msg = result.msg or "Unknown error"
+                logger.error(f"Image analysis failed: {error_msg}")
+                raise Exception(f"Image analysis failed: {error_msg}")
+            
+            # Extract text description from result (format varies by source)
+            description = None
+            
+            # Try format 1: Selfhost outputs (direct text in outputs)
+            # Format: {'6': {'text': ['description text']}}; first non-empty 'text' list wins
+            if result.outputs:
+                for node_id, node_output in result.outputs.items():
+                    if 'text' in node_output:
+                        text_list = node_output['text']
+                        if text_list and len(text_list) > 0:
+                            description = text_list[0]
+                            break
+            
+            # Try format 2: RunningHub raw_data (text file URL)
+            # Format: {'raw_data': [{'fileUrl': 'https://...txt', 'fileType': 'txt', ...}]}
+            if not description and result.outputs and 'raw_data' in result.outputs:
+                raw_data = result.outputs['raw_data']
+                if raw_data and len(raw_data) > 0:
+                    # Find text file entry
+                    for item in raw_data:
+                        if item.get('fileType') == 'txt' and 'fileUrl' in item:
+                            # Download text content; a non-200 reply is skipped silently and the next item is tried
+                            import aiohttp
+                            async with aiohttp.ClientSession() as session:
+                                async with session.get(item['fileUrl']) as resp:
+                                    if resp.status == 200:
+                                        description = await resp.text()
+                                        description = description.strip()
+                                        break
+            
+            if not description:
+                logger.error(f"No text found in outputs: {result.outputs}")
+                raise Exception("No description generated")
+            
+            logger.info(f"✅ Image analyzed: {description[:100]}...")
+            
+            return description
+        
+        except Exception as e:
+            logger.error(f"Image analysis error: {e}")
+            raise
--- a/pixelle_video/services/llm_service.py
+++ b/pixelle_video/services/llm_service.py
@@ -12,15 +12,22 @@

 """
 LLM (Large Language Model) Service - Direct OpenAI SDK implementation
+
+Supports structured output via response_type parameter (Pydantic model).
 """

-import os
-from typing import Optional
+import json
+import re
+from typing import Optional, Type, TypeVar, Union

 from openai import AsyncOpenAI
+from pydantic import BaseModel
 from loguru import logger


+T = TypeVar("T", bound=BaseModel)
+
+
 class LLMService:
    """
    LLM (Large Language Model) service
@@ -114,8 +121,9 @@ class LLMService:
        model: Optional[str] = None,
        temperature: float = 0.7,
        max_tokens: int = 2000,
+        response_type: Optional[Type[T]] = None,
        **kwargs
-    ) -> str:
+    ) -> Union[str, T]:
        """
        Generate text using LLM
        
@@ -126,24 +134,28 @@ class LLMService:
            model: Model name (optional, uses config if not provided)
            temperature: Sampling temperature (0.0-2.0). Lower is more deterministic.
            max_tokens: Maximum tokens to generate
+            response_type: Optional Pydantic model class for structured output.
+                          If provided, returns parsed model instance instead of string.
            **kwargs: Additional provider-specific parameters
        
        Returns:
-            Generated text
+            Generated text (str) or parsed Pydantic model instance (if response_type provided)
        
        Examples:
-            # Use config from config.yaml
+            # Basic text generation
            answer = await pixelle_video.llm("Explain atomic habits")
            
-            # Override with custom parameters
-            answer = await pixelle_video.llm(
-                prompt="Explain atomic habits in 3 sentences",
-                api_key="sk-custom-key",
-                base_url="https://api.custom.com/v1",
-                model="custom-model",
-                temperature=0.7,
-                max_tokens=500
+            # Structured output with Pydantic model
+            class MovieReview(BaseModel):
+                title: str
+                rating: int
+                summary: str
+            
+            review = await pixelle_video.llm(
+                prompt="Review the movie Inception",
+                response_type=MovieReview
            )
+            print(review.title)  # Structured access
        """
        # Create client (new instance each time to support parameter overrides)
        client = self._create_client(api_key=api_key, base_url=base_url)
@@ -155,25 +167,143 @@ class LLMService:
            or "gpt-3.5-turbo"  # Default fallback
        )
        
-        logger.debug(f"LLM call: model={final_model}, base_url={client.base_url}")
+        logger.debug(f"LLM call: model={final_model}, base_url={client.base_url}, response_type={response_type}")
        
        try:
-            response = await client.chat.completions.create(
-                model=final_model,
+            if response_type is not None:
+                # Structured output mode - try beta.chat.completions.parse first
+                return await self._call_with_structured_output(
+                    client=client,
+                    model=final_model,
+                    prompt=prompt,
+                    response_type=response_type,
+                    temperature=temperature,
+                    max_tokens=max_tokens,
+                    **kwargs
+                )
+            else:
+                # Standard text output mode
+                response = await client.chat.completions.create(
+                    model=final_model,
+                    messages=[{"role": "user", "content": prompt}],
+                    temperature=temperature,
+                    max_tokens=max_tokens,
+                    **kwargs
+                )
+                
+                result = response.choices[0].message.content
+                logger.debug(f"LLM response length: {len(result)} chars")
+                
+                return result
+        
+        except Exception as e:
+            logger.error(f"LLM call error (model={final_model}, base_url={client.base_url}): {e}")
+            raise
+    
+    async def _call_with_structured_output(
+        self,
+        client: AsyncOpenAI,
+        model: str,
+        prompt: str,
+        response_type: Type[T],
+        temperature: float,
+        max_tokens: int,
+        **kwargs
+    ) -> T:
+        """
+        Call LLM with structured output support
+        
+        Tries OpenAI beta.chat.completions.parse first, falls back to JSON parsing
+        if the provider doesn't support structured outputs.
+        
+        Args:
+            client: OpenAI client
+            model: Model name
+            prompt: The prompt
+            response_type: Pydantic model class
+            temperature: Sampling temperature
+            max_tokens: Max tokens
+            **kwargs: Additional parameters
+        
+        Returns:
+            Parsed Pydantic model instance
+        """
+        # Attempt 1: structured output API (beta.chat.completions.parse) with the model as response_format
+        try:
+            response = await client.beta.chat.completions.parse(
+                model=model,
                 messages=[{"role": "user", "content": prompt}],
+                response_format=response_type,
                 temperature=temperature,
                 max_tokens=max_tokens,
                 **kwargs
             )
             
-            result = response.choices[0].message.content
-            logger.debug(f"LLM response length: {len(result)} chars")
+            parsed = response.choices[0].message.parsed
+            if parsed is not None:
+                logger.debug(f"Structured output parsed successfully via beta API")
+                return parsed
+            
+            # parsed is None: reuse this response's raw text for the JSON fallback (no second request)
+            logger.warning("Structured output API returned None, falling back to JSON parsing")
+            content = response.choices[0].message.content
             
-            return result
-        
         except Exception as e:
-            logger.error(f"LLM call error (model={final_model}, base_url={client.base_url}): {e}")
-            raise
+            # Any exception from the parse call lands here (not only "unsupported"); retry as a plain completion
+            logger.debug(f"Structured output API not available ({e}), falling back to JSON parsing")
+            
+            response = await client.chat.completions.create(
+                model=model,
+                messages=[{"role": "user", "content": prompt}],
+                temperature=temperature,
+                max_tokens=max_tokens,
+                **kwargs
+            )
+            content = response.choices[0].message.content
+        
+        # Reached only when the beta API yielded no parsed model: extract JSON from the raw text
+        return self._parse_response_as_model(content, response_type)
+    
+    def _parse_response_as_model(self, content: str, response_type: Type[T]) -> T:
+        """
+        Parse LLM response content as a Pydantic model.
+
+        Tries, in order: the whole text as JSON, the first fenced markdown
+        code block, then the span from the first '{' to the last '}'.
+
+        Args:
+            content: Raw LLM response text (None/empty is rejected up front)
+            response_type: Target Pydantic model class
+
+        Returns:
+            Parsed model instance
+
+        Raises:
+            ValueError: If content is empty or no candidate decodes as JSON
+        """
+        # The caller forwards message.content unchecked; json.loads(None) raises
+        # TypeError, which would bypass every fallback below, so fail clearly.
+        if not content:
+            raise ValueError(f"Failed to parse LLM response as {response_type.__name__}: empty response")
+
+        candidates = [content]
+        # Fenced ```json ... ``` block ([\s\S] already spans newlines)
+        fenced = re.search(r'```(?:json)?\s*([\s\S]+?)\s*```', content)
+        if fenced:
+            candidates.append(fenced.group(1))
+        # Outermost brace span, for JSON embedded in surrounding prose
+        brace_start, brace_end = content.find('{'), content.rfind('}')
+        if brace_start != -1 and brace_end > brace_start:
+            candidates.append(content[brace_start:brace_end + 1])
+
+        for candidate in candidates:
+            try:
+                data = json.loads(candidate)
+            except json.JSONDecodeError:
+                continue  # not valid JSON; try the next extraction strategy
+            return response_type.model_validate(data)
+
+        raise ValueError(f"Failed to parse LLM response as {response_type.__name__}: {content[:200]}...")
    
    @property
    def active(self) -> str: