支持基于图片素材的视频生成逻辑

This commit is contained in:
puke
2025-12-03 20:11:32 +08:00
parent 6e99612a68
commit ea784e0d06
9 changed files with 1180 additions and 40 deletions

View File

@@ -73,23 +73,29 @@ class FrameProcessor:
frame_num = frame.index + 1
# Determine if this frame needs image generation
needs_image = frame.image_prompt is not None
# If image_path is already set (e.g. asset-based pipeline), we consider it "needs image" but skip generation
has_existing_image = frame.image_path is not None
needs_generation = frame.image_prompt is not None
try:
# Step 1: Generate audio (TTS)
if progress_callback:
progress_callback(ProgressEvent(
event_type="frame_step",
progress=0.0,
frame_current=frame_num,
frame_total=total_frames,
step=1,
action="audio"
))
await self._step_generate_audio(frame, config)
if not frame.audio_path:
if progress_callback:
progress_callback(ProgressEvent(
event_type="frame_step",
progress=0.0,
frame_current=frame_num,
frame_total=total_frames,
step=1,
action="audio"
))
await self._step_generate_audio(frame, config)
else:
logger.debug(f" 1/4: Using existing audio: {frame.audio_path}")
# Step 2: Generate media (image or video, conditional)
if needs_image:
# Step 2: Generate media (image or video, conditional)
if needs_generation:
if progress_callback:
progress_callback(ProgressEvent(
event_type="frame_step",
@@ -100,16 +106,18 @@ class FrameProcessor:
action="media"
))
await self._step_generate_media(frame, config)
elif has_existing_image:
logger.debug(f" 2/4: Using existing image: {frame.image_path}")
else:
frame.image_path = None
frame.media_type = None
logger.debug(f" 2/4: Skipped media generation (not required by template)")
# Step 3: Compose frame (add subtitle)
if progress_callback:
progress_callback(ProgressEvent(
event_type="frame_step",
progress=0.50 if needs_image else 0.33,
progress=0.50 if (needs_generation or has_existing_image) else 0.33,
frame_current=frame_num,
frame_total=total_frames,
step=3,
@@ -121,17 +129,18 @@ class FrameProcessor:
if progress_callback:
progress_callback(ProgressEvent(
event_type="frame_step",
progress=0.75 if needs_image else 0.67,
progress=0.75 if (needs_generation or has_existing_image) else 0.67,
frame_current=frame_num,
frame_total=total_frames,
step=4,
action="video"
))
await self._step_create_video_segment(frame, config)
logger.info(f"✅ Frame {frame.index} completed")
return frame
except Exception as e:
logger.error(f"❌ Failed to process frame {frame.index}: {e}")
raise
@@ -303,6 +312,9 @@ class FrameProcessor:
# Generate frame using HTML (size is auto-parsed from template path)
generator = HTMLFrameGenerator(template_path)
logger.debug(f"Generating frame with image: '{frame.image_path}' (type: {type(frame.image_path)})")
composed_path = await generator.generate_frame(
title=storyboard.title,
text=frame.narration,

View File

@@ -0,0 +1,197 @@
# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Image Analysis Service - ComfyUI Workflow-based implementation
Uses Florence-2 or other vision models to analyze images and generate descriptions.
"""
from typing import Optional, Literal
from pathlib import Path
from comfykit import ComfyKit
from loguru import logger
from pixelle_video.services.comfy_base_service import ComfyBaseService
class ImageAnalysisService(ComfyBaseService):
    """
    Image analysis service - Workflow-based

    Executes an image analysis workflow (e.g., Florence-2, BLIP, etc.) through
    the ComfyKit instance shared by the core, and returns the textual
    description found in the workflow outputs.

    Convention: workflows follow {source}/analyse_image.json pattern
    - runninghub/analyse_image.json (default, cloud-based)
    - selfhost/analyse_image.json (local ComfyUI)

    Usage:
        # Use default (runninghub cloud)
        description = await pixelle_video.image_analysis("path/to/image.jpg")

        # Use local ComfyUI
        description = await pixelle_video.image_analysis(
            "path/to/image.jpg",
            source="selfhost"
        )

        # List available workflows
        workflows = pixelle_video.image_analysis.list_workflows()
    """

    WORKFLOW_PREFIX = "analyse_"
    WORKFLOWS_DIR = "workflows"

    def __init__(self, config: dict, core=None):
        """
        Initialize image analysis service

        Args:
            config: Full application config dict
            core: PixelleVideoCore instance (for accessing shared ComfyKit)
        """
        super().__init__(config, service_name="image_analysis", core=core)

    async def __call__(
        self,
        image_path: str,
        # Workflow source selection
        source: Literal['runninghub', 'selfhost'] = 'runninghub',
        workflow: Optional[str] = None,
        # ComfyUI connection (optional overrides)
        comfyui_url: Optional[str] = None,
        runninghub_api_key: Optional[str] = None,
        # Additional workflow parameters
        **params
    ) -> str:
        """
        Analyze an image using workflow

        Args:
            image_path: Path to the image file (local or http(s) URL). Local
                paths must exist; URLs are forwarded to the workflow as-is.
            source: Workflow source - 'runninghub' (cloud, default) or 'selfhost' (local ComfyUI)
            workflow: Workflow filename (optional, overrides source-based resolution)
            comfyui_url: ComfyUI URL (optional, overrides config)
            runninghub_api_key: RunningHub API key (optional, overrides config)
            **params: Additional workflow parameters (merged over the
                built-in "image" parameter, so they can override it)

        Returns:
            str: Text description of the image

        Raises:
            FileNotFoundError: If image_path is a local path that does not exist
            RuntimeError: If the workflow does not complete, or no text
                description can be found in its outputs

        NOTE(review): comfyui_url and runninghub_api_key are accepted but not
        forwarded anywhere in this method; the shared ComfyKit instance from
        core is always used. Confirm whether overrides should be wired in.

        Examples:
            # Simplest: use default (runninghub cloud)
            description = await pixelle_video.image_analysis("temp/06.JPG")

            # Use local ComfyUI
            description = await pixelle_video.image_analysis(
                "temp/06.JPG",
                source="selfhost"
            )

            # Use specific workflow (bypass source-based resolution)
            description = await pixelle_video.image_analysis(
                "temp/06.JPG",
                workflow="selfhost/custom_analysis.json"
            )
        """
        from pixelle_video.utils.workflow_util import resolve_workflow_path

        # 1. Validate image path. Remote URLs cannot be checked on the local
        #    filesystem, so only local paths are required to exist.
        #    NOTE(review): assumes the workflow executor accepts http(s) URLs
        #    for the "image" parameter, as the docstring advertises - confirm.
        if not self._is_remote_url(image_path) and not Path(image_path).exists():
            raise FileNotFoundError(f"Image file not found: {image_path}")

        # 2. Resolve workflow path using convention
        if workflow is None:
            # Use standardized naming: {source}/analyse_image.json
            workflow = resolve_workflow_path("analyse_image", source)
            logger.info(f"Using {source} workflow: {workflow}")

        # 3. Resolve workflow (returns structured info)
        workflow_info = self._resolve_workflow(workflow=workflow)

        # 4. Build workflow parameters; caller-supplied params win on conflict
        workflow_params = {
            "image": str(image_path)  # Pass image path to workflow
        }
        workflow_params.update(params)
        logger.debug(f"Workflow parameters: {workflow_params}")

        # 5. Execute workflow using shared ComfyKit instance from core
        try:
            # Get shared ComfyKit instance (lazy initialization + config hot-reload)
            kit = await self.core._get_or_create_comfykit()

            # Determine what to pass to ComfyKit based on source
            if workflow_info["source"] == "runninghub" and "workflow_id" in workflow_info:
                # RunningHub: pass workflow_id
                workflow_input = workflow_info["workflow_id"]
                logger.info(f"Executing RunningHub workflow: {workflow_input}")
            else:
                # Selfhost: pass file path
                workflow_input = workflow_info["path"]
                logger.info(f"Executing selfhost workflow: {workflow_input}")

            result = await kit.execute(workflow_input, workflow_params)

            # 6. Extract description from result
            if result.status != "completed":
                error_msg = result.msg or "Unknown error"
                logger.error(f"Image analysis failed: {error_msg}")
                raise RuntimeError(f"Image analysis failed: {error_msg}")

            description = await self._extract_description(result.outputs)
            if not description:
                logger.error(f"No text found in outputs: {result.outputs}")
                raise RuntimeError("No description generated")

            logger.info(f"✅ Image analyzed: {description[:100]}...")
            return description

        except Exception as e:
            logger.error(f"Image analysis error: {e}")
            raise

    @staticmethod
    def _is_remote_url(image_path) -> bool:
        """Return True when image_path is an http(s) URL rather than a local path."""
        return str(image_path).lower().startswith(("http://", "https://"))

    async def _extract_description(self, outputs) -> Optional[str]:
        """
        Pull the text description out of workflow outputs (format varies by source)

        Returns None when neither known output format yields a non-empty text.
        """
        if not outputs:
            return None

        # Format 1: Selfhost outputs (direct text in outputs)
        # Format: {'6': {'text': ['description text']}}
        description = None
        for node_output in outputs.values():
            # Entries such as 'raw_data' hold lists, not node dicts - skip them
            if not isinstance(node_output, dict):
                continue
            text_list = node_output.get('text')
            if text_list:
                description = text_list[0]
                break
        if description:
            return description

        # Format 2: RunningHub raw_data (text file URL)
        # Format: {'raw_data': [{'fileUrl': 'https://...txt', 'fileType': 'txt', ...}]}
        for item in outputs.get('raw_data') or []:
            if not isinstance(item, dict):
                continue
            if item.get('fileType') == 'txt' and 'fileUrl' in item:
                description = await self._download_text(item['fileUrl'])
                if description:
                    return description
        return None

    @staticmethod
    async def _download_text(url: str) -> Optional[str]:
        """Download a text file and return its stripped content, or None on a non-200 response."""
        import aiohttp

        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                if resp.status != 200:
                    # Surface the failure instead of silently yielding no description
                    logger.warning(f"Failed to download analysis text (HTTP {resp.status}): {url}")
                    return None
                return (await resp.text()).strip()

View File

@@ -12,15 +12,22 @@
"""
LLM (Large Language Model) Service - Direct OpenAI SDK implementation
Supports structured output via response_type parameter (Pydantic model).
"""
import os
from typing import Optional
import json
import re
from typing import Optional, Type, TypeVar, Union
from openai import AsyncOpenAI
from pydantic import BaseModel
from loguru import logger
T = TypeVar("T", bound=BaseModel)
class LLMService:
"""
LLM (Large Language Model) service
@@ -114,8 +121,9 @@ class LLMService:
model: Optional[str] = None,
temperature: float = 0.7,
max_tokens: int = 2000,
response_type: Optional[Type[T]] = None,
**kwargs
) -> str:
) -> Union[str, T]:
"""
Generate text using LLM
@@ -126,24 +134,28 @@ class LLMService:
model: Model name (optional, uses config if not provided)
temperature: Sampling temperature (0.0-2.0). Lower is more deterministic.
max_tokens: Maximum tokens to generate
response_type: Optional Pydantic model class for structured output.
If provided, returns parsed model instance instead of string.
**kwargs: Additional provider-specific parameters
Returns:
Generated text
Generated text (str) or parsed Pydantic model instance (if response_type provided)
Examples:
# Use config from config.yaml
# Basic text generation
answer = await pixelle_video.llm("Explain atomic habits")
# Override with custom parameters
answer = await pixelle_video.llm(
prompt="Explain atomic habits in 3 sentences",
api_key="sk-custom-key",
base_url="https://api.custom.com/v1",
model="custom-model",
temperature=0.7,
max_tokens=500
# Structured output with Pydantic model
class MovieReview(BaseModel):
title: str
rating: int
summary: str
review = await pixelle_video.llm(
prompt="Review the movie Inception",
response_type=MovieReview
)
print(review.title) # Structured access
"""
# Create client (new instance each time to support parameter overrides)
client = self._create_client(api_key=api_key, base_url=base_url)
@@ -155,25 +167,143 @@ class LLMService:
or "gpt-3.5-turbo" # Default fallback
)
logger.debug(f"LLM call: model={final_model}, base_url={client.base_url}")
logger.debug(f"LLM call: model={final_model}, base_url={client.base_url}, response_type={response_type}")
try:
response = await client.chat.completions.create(
model=final_model,
if response_type is not None:
# Structured output mode - try beta.chat.completions.parse first
return await self._call_with_structured_output(
client=client,
model=final_model,
prompt=prompt,
response_type=response_type,
temperature=temperature,
max_tokens=max_tokens,
**kwargs
)
else:
# Standard text output mode
response = await client.chat.completions.create(
model=final_model,
messages=[{"role": "user", "content": prompt}],
temperature=temperature,
max_tokens=max_tokens,
**kwargs
)
result = response.choices[0].message.content
logger.debug(f"LLM response length: {len(result)} chars")
return result
except Exception as e:
logger.error(f"LLM call error (model={final_model}, base_url={client.base_url}): {e}")
raise
async def _call_with_structured_output(
    self,
    client: AsyncOpenAI,
    model: str,
    prompt: str,
    response_type: Type[T],
    temperature: float,
    max_tokens: int,
    **kwargs
) -> T:
    """
    Call LLM with structured output support

    Tries OpenAI beta.chat.completions.parse first. If that call raises, a
    plain chat completion is issued instead; if it succeeds but yields no
    parsed object, the raw message content of that same response is used.
    In both fallback cases the text is handed to _parse_response_as_model.

    Args:
        client: OpenAI client
        model: Model name
        prompt: The prompt
        response_type: Pydantic model class
        temperature: Sampling temperature
        max_tokens: Max tokens
        **kwargs: Additional parameters forwarded to both API calls

    Returns:
        Parsed Pydantic model instance

    Raises:
        Whatever the fallback chat completion or _parse_response_as_model
        raises; errors from the beta parse call itself are not propagated.
    """
    messages = [{"role": "user", "content": prompt}]

    # Try OpenAI structured output API first (beta.chat.completions.parse)
    try:
        response = await client.beta.chat.completions.parse(
            model=model,
            messages=messages,
            response_format=response_type,
            temperature=temperature,
            max_tokens=max_tokens,
            **kwargs
        )
        message = response.choices[0].message
        if message.parsed is not None:
            logger.debug("Structured output parsed successfully via beta API")
            return message.parsed

        # Parsed is None: reuse this response's raw text for the JSON fallback
        logger.warning("Structured output API returned None, falling back to JSON parsing")
        content = message.content
    except Exception as e:
        # Any failure of the beta API is treated as "not supported" by the
        # provider, so retry once with a plain completion and parse its text.
        logger.debug(f"Structured output API not available ({e}), falling back to JSON parsing")
        response = await client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            **kwargs
        )
        content = response.choices[0].message.content

    # Fallback: Parse JSON from response content
    return self._parse_response_as_model(content, response_type)
def _parse_response_as_model(self, content: str, response_type: Type[T]) -> T:
    """
    Parse LLM response content as Pydantic model

    JSON candidates are tried in order: the whole text, the first markdown
    code block, then the span from the first '{' to the last '}'. The first
    candidate that decodes as JSON is validated with
    response_type.model_validate; later candidates are not tried after that.

    Args:
        content: Raw LLM response text
        response_type: Target Pydantic model class

    Returns:
        Parsed model instance

    Raises:
        ValueError: If content is None/blank or no candidate decodes as JSON.
        Errors raised by response_type.model_validate propagate unchanged.
    """
    # A response may carry no text at all; fail with the documented error
    # instead of a TypeError from json.loads(None).
    if content is None or (isinstance(content, str) and not content.strip()):
        raise ValueError(
            f"Failed to parse LLM response as {response_type.__name__}: empty response content"
        )

    # Candidate 1: the raw content itself
    candidates = [content]

    # Candidate 2: payload of a markdown code block
    fenced = re.search(r'```(?:json)?\s*([\s\S]+?)\s*```', content)
    if fenced:
        candidates.append(fenced.group(1))

    # Candidate 3: outermost brace-delimited span in the text
    brace_start = content.find('{')
    brace_end = content.rfind('}')
    if brace_start != -1 and brace_end > brace_start:
        candidates.append(content[brace_start:brace_end + 1])

    for candidate in candidates:
        try:
            data = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        return response_type.model_validate(data)

    raise ValueError(f"Failed to parse LLM response as {response_type.__name__}: {content[:200]}...")
@property
def active(self) -> str: