支持基于图片素材的视频生成逻辑
This commit is contained in:
@@ -73,23 +73,29 @@ class FrameProcessor:
|
||||
frame_num = frame.index + 1
|
||||
|
||||
# Determine if this frame needs image generation
|
||||
needs_image = frame.image_prompt is not None
|
||||
# If image_path is already set (e.g. asset-based pipeline), we consider it "needs image" but skip generation
|
||||
has_existing_image = frame.image_path is not None
|
||||
needs_generation = frame.image_prompt is not None
|
||||
|
||||
try:
|
||||
# Step 1: Generate audio (TTS)
|
||||
if progress_callback:
|
||||
progress_callback(ProgressEvent(
|
||||
event_type="frame_step",
|
||||
progress=0.0,
|
||||
frame_current=frame_num,
|
||||
frame_total=total_frames,
|
||||
step=1,
|
||||
action="audio"
|
||||
))
|
||||
await self._step_generate_audio(frame, config)
|
||||
if not frame.audio_path:
|
||||
if progress_callback:
|
||||
progress_callback(ProgressEvent(
|
||||
event_type="frame_step",
|
||||
progress=0.0,
|
||||
frame_current=frame_num,
|
||||
frame_total=total_frames,
|
||||
step=1,
|
||||
action="audio"
|
||||
))
|
||||
await self._step_generate_audio(frame, config)
|
||||
else:
|
||||
logger.debug(f" 1/4: Using existing audio: {frame.audio_path}")
|
||||
|
||||
# Step 2: Generate media (image or video, conditional)
|
||||
if needs_image:
|
||||
# Step 2: Generate media (image or video, conditional)
|
||||
if needs_generation:
|
||||
if progress_callback:
|
||||
progress_callback(ProgressEvent(
|
||||
event_type="frame_step",
|
||||
@@ -100,16 +106,18 @@ class FrameProcessor:
|
||||
action="media"
|
||||
))
|
||||
await self._step_generate_media(frame, config)
|
||||
elif has_existing_image:
|
||||
logger.debug(f" 2/4: Using existing image: {frame.image_path}")
|
||||
else:
|
||||
frame.image_path = None
|
||||
frame.media_type = None
|
||||
logger.debug(f" 2/4: Skipped media generation (not required by template)")
|
||||
|
||||
|
||||
# Step 3: Compose frame (add subtitle)
|
||||
if progress_callback:
|
||||
progress_callback(ProgressEvent(
|
||||
event_type="frame_step",
|
||||
progress=0.50 if needs_image else 0.33,
|
||||
progress=0.50 if (needs_generation or has_existing_image) else 0.33,
|
||||
frame_current=frame_num,
|
||||
frame_total=total_frames,
|
||||
step=3,
|
||||
@@ -121,17 +129,18 @@ class FrameProcessor:
|
||||
if progress_callback:
|
||||
progress_callback(ProgressEvent(
|
||||
event_type="frame_step",
|
||||
progress=0.75 if needs_image else 0.67,
|
||||
progress=0.75 if (needs_generation or has_existing_image) else 0.67,
|
||||
frame_current=frame_num,
|
||||
frame_total=total_frames,
|
||||
step=4,
|
||||
action="video"
|
||||
))
|
||||
|
||||
await self._step_create_video_segment(frame, config)
|
||||
|
||||
logger.info(f"✅ Frame {frame.index} completed")
|
||||
return frame
|
||||
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"❌ Failed to process frame {frame.index}: {e}")
|
||||
raise
|
||||
@@ -303,6 +312,9 @@ class FrameProcessor:
|
||||
|
||||
# Generate frame using HTML (size is auto-parsed from template path)
|
||||
generator = HTMLFrameGenerator(template_path)
|
||||
|
||||
logger.debug(f"Generating frame with image: '{frame.image_path}' (type: {type(frame.image_path)})")
|
||||
|
||||
composed_path = await generator.generate_frame(
|
||||
title=storyboard.title,
|
||||
text=frame.narration,
|
||||
|
||||
197
pixelle_video/services/image_analysis.py
Normal file
197
pixelle_video/services/image_analysis.py
Normal file
@@ -0,0 +1,197 @@
|
||||
# Copyright (C) 2025 AIDC-AI
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Image Analysis Service - ComfyUI Workflow-based implementation
|
||||
|
||||
Uses Florence-2 or other vision models to analyze images and generate descriptions.
|
||||
"""
|
||||
|
||||
from typing import Optional, Literal
|
||||
from pathlib import Path
|
||||
|
||||
from comfykit import ComfyKit
|
||||
from loguru import logger
|
||||
|
||||
from pixelle_video.services.comfy_base_service import ComfyBaseService
|
||||
|
||||
|
||||
class ImageAnalysisService(ComfyBaseService):
    """
    Image analysis service - Workflow-based

    Uses ComfyKit to execute image analysis workflows (e.g., Florence-2, BLIP, etc.).
    Returns detailed textual descriptions of images.

    Convention: workflows follow {source}/analyse_image.json pattern
    - runninghub/analyse_image.json (default, cloud-based)
    - selfhost/analyse_image.json (local ComfyUI)

    Usage:
        # Use default (runninghub cloud)
        description = await pixelle_video.image_analysis("path/to/image.jpg")

        # Use local ComfyUI
        description = await pixelle_video.image_analysis(
            "path/to/image.jpg",
            source="selfhost"
        )

        # List available workflows
        workflows = pixelle_video.image_analysis.list_workflows()
    """

    WORKFLOW_PREFIX = "analyse_"
    WORKFLOWS_DIR = "workflows"

    def __init__(self, config: dict, core=None):
        """
        Initialize image analysis service

        Args:
            config: Full application config dict
            core: PixelleVideoCore instance (for accessing shared ComfyKit)
        """
        super().__init__(config, service_name="image_analysis", core=core)

    async def __call__(
        self,
        image_path: str,
        # Workflow source selection
        source: Literal['runninghub', 'selfhost'] = 'runninghub',
        workflow: Optional[str] = None,
        # ComfyUI connection (optional overrides)
        comfyui_url: Optional[str] = None,
        runninghub_api_key: Optional[str] = None,
        # Additional workflow parameters
        **params
    ) -> str:
        """
        Analyze an image using workflow

        Args:
            image_path: Path to the image file (local or URL). Local paths must
                exist; http(s) URLs are passed to the workflow unchecked.
            source: Workflow source - 'runninghub' (cloud, default) or 'selfhost' (local ComfyUI)
            workflow: Workflow filename (optional, overrides source-based resolution)
            comfyui_url: ComfyUI URL override. Accepted for interface
                compatibility; this method does not read it.
            runninghub_api_key: RunningHub API key override. Accepted for
                interface compatibility; this method does not read it.
            **params: Additional workflow parameters (merged over the default
                ``{"image": image_path}`` mapping, so they may override it)

        Returns:
            str: Text description of the image

        Raises:
            FileNotFoundError: If a local ``image_path`` does not exist.
            Exception: If the workflow does not complete or yields no text.

        Examples:
            # Simplest: use default (runninghub cloud)
            description = await pixelle_video.image_analysis("temp/06.JPG")

            # Use local ComfyUI
            description = await pixelle_video.image_analysis(
                "temp/06.JPG",
                source="selfhost"
            )

            # Use specific workflow (bypass source-based resolution)
            description = await pixelle_video.image_analysis(
                "temp/06.JPG",
                workflow="selfhost/custom_analysis.json"
            )
        """
        from pixelle_video.utils.workflow_util import resolve_workflow_path

        # 1. Validate image path. Only local paths can be checked on disk; a
        #    URL would always fail Path.exists(), contradicting the documented
        #    "local or URL" contract.
        #    NOTE(review): assumes the workflow can consume a URL as "image" - confirm.
        is_remote = str(image_path).lower().startswith(("http://", "https://"))
        if not is_remote and not Path(image_path).exists():
            raise FileNotFoundError(f"Image file not found: {image_path}")

        # 2. Resolve workflow path using convention
        if workflow is None:
            # Use standardized naming: {source}/analyse_image.json
            workflow = resolve_workflow_path("analyse_image", source)
            logger.info(f"Using {source} workflow: {workflow}")

        # 3. Resolve workflow (returns structured info)
        workflow_info = self._resolve_workflow(workflow=workflow)

        # 4. Build workflow parameters (caller-supplied params win on conflict)
        workflow_params = {"image": str(image_path)}
        workflow_params.update(params)
        logger.debug(f"Workflow parameters: {workflow_params}")

        # 5. Execute workflow using shared ComfyKit instance from core
        try:
            # Get shared ComfyKit instance (lazy initialization + config hot-reload)
            kit = await self.core._get_or_create_comfykit()

            # Determine what to pass to ComfyKit based on source
            if workflow_info["source"] == "runninghub" and "workflow_id" in workflow_info:
                # RunningHub: pass workflow_id
                workflow_input = workflow_info["workflow_id"]
                logger.info(f"Executing RunningHub workflow: {workflow_input}")
            else:
                # Selfhost: pass file path
                workflow_input = workflow_info["path"]
                logger.info(f"Executing selfhost workflow: {workflow_input}")

            result = await kit.execute(workflow_input, workflow_params)

            # 6. Extract description from result
            if result.status != "completed":
                error_msg = result.msg or "Unknown error"
                logger.error(f"Image analysis failed: {error_msg}")
                raise Exception(f"Image analysis failed: {error_msg}")

            description = await self._extract_description(result.outputs)
            if not description:
                logger.error(f"No text found in outputs: {result.outputs}")
                raise Exception("No description generated")

            logger.info(f"✅ Image analyzed: {description[:100]}...")
            return description

        except Exception as e:
            logger.error(f"Image analysis error: {e}")
            raise

    @staticmethod
    async def _extract_description(outputs) -> Optional[str]:
        """
        Pull the text description out of a workflow result's outputs.

        Two layouts are recognised:
        - Selfhost: ``{'6': {'text': ['description text']}}``
        - RunningHub: ``{'raw_data': [{'fileUrl': 'https://...txt', 'fileType': 'txt', ...}]}``

        Args:
            outputs: The ``outputs`` mapping of the workflow result (may be empty/None).

        Returns:
            The first non-empty description found, or None if there is none.
        """
        if not outputs:
            return None

        # Format 1: direct text in a node output
        for node_output in outputs.values():
            # Non-dict values (e.g. the 'raw_data' list) carry no inline text.
            if not isinstance(node_output, dict) or 'text' not in node_output:
                continue
            text = node_output['text']
            if not text:
                continue
            # Accept a bare string as well as the usual list-of-strings, so a
            # string value is not truncated to its first character.
            candidate = text if isinstance(text, str) else text[0]
            if candidate:
                return candidate

        # Format 2: text file URL(s) listed under 'raw_data'
        raw_data = outputs.get('raw_data') or []
        text_urls = [
            item['fileUrl']
            for item in raw_data
            if isinstance(item, dict) and item.get('fileType') == 'txt' and 'fileUrl' in item
        ]
        if not text_urls:
            return None

        # Download text content from URL (one session shared by all candidates)
        import aiohttp
        async with aiohttp.ClientSession() as session:
            for url in text_urls:
                async with session.get(url) as resp:
                    if resp.status != 200:
                        logger.warning(f"Failed to download analysis text (HTTP {resp.status}): {url}")
                        continue
                    text = (await resp.text()).strip()
                    if text:
                        return text

        return None
|
||||
@@ -12,15 +12,22 @@
|
||||
|
||||
"""
|
||||
LLM (Large Language Model) Service - Direct OpenAI SDK implementation
|
||||
|
||||
Supports structured output via response_type parameter (Pydantic model).
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import Optional
|
||||
import json
|
||||
import re
|
||||
from typing import Optional, Type, TypeVar, Union
|
||||
|
||||
from openai import AsyncOpenAI
|
||||
from pydantic import BaseModel
|
||||
from loguru import logger
|
||||
|
||||
|
||||
T = TypeVar("T", bound=BaseModel)
|
||||
|
||||
|
||||
class LLMService:
|
||||
"""
|
||||
LLM (Large Language Model) service
|
||||
@@ -114,8 +121,9 @@ class LLMService:
|
||||
model: Optional[str] = None,
|
||||
temperature: float = 0.7,
|
||||
max_tokens: int = 2000,
|
||||
response_type: Optional[Type[T]] = None,
|
||||
**kwargs
|
||||
) -> str:
|
||||
) -> Union[str, T]:
|
||||
"""
|
||||
Generate text using LLM
|
||||
|
||||
@@ -126,24 +134,28 @@ class LLMService:
|
||||
model: Model name (optional, uses config if not provided)
|
||||
temperature: Sampling temperature (0.0-2.0). Lower is more deterministic.
|
||||
max_tokens: Maximum tokens to generate
|
||||
response_type: Optional Pydantic model class for structured output.
|
||||
If provided, returns parsed model instance instead of string.
|
||||
**kwargs: Additional provider-specific parameters
|
||||
|
||||
Returns:
|
||||
Generated text
|
||||
Generated text (str) or parsed Pydantic model instance (if response_type provided)
|
||||
|
||||
Examples:
|
||||
# Use config from config.yaml
|
||||
# Basic text generation
|
||||
answer = await pixelle_video.llm("Explain atomic habits")
|
||||
|
||||
# Override with custom parameters
|
||||
answer = await pixelle_video.llm(
|
||||
prompt="Explain atomic habits in 3 sentences",
|
||||
api_key="sk-custom-key",
|
||||
base_url="https://api.custom.com/v1",
|
||||
model="custom-model",
|
||||
temperature=0.7,
|
||||
max_tokens=500
|
||||
# Structured output with Pydantic model
|
||||
class MovieReview(BaseModel):
|
||||
title: str
|
||||
rating: int
|
||||
summary: str
|
||||
|
||||
review = await pixelle_video.llm(
|
||||
prompt="Review the movie Inception",
|
||||
response_type=MovieReview
|
||||
)
|
||||
print(review.title) # Structured access
|
||||
"""
|
||||
# Create client (new instance each time to support parameter overrides)
|
||||
client = self._create_client(api_key=api_key, base_url=base_url)
|
||||
@@ -155,25 +167,143 @@ class LLMService:
|
||||
or "gpt-3.5-turbo" # Default fallback
|
||||
)
|
||||
|
||||
logger.debug(f"LLM call: model={final_model}, base_url={client.base_url}")
|
||||
logger.debug(f"LLM call: model={final_model}, base_url={client.base_url}, response_type={response_type}")
|
||||
|
||||
try:
|
||||
response = await client.chat.completions.create(
|
||||
model=final_model,
|
||||
if response_type is not None:
|
||||
# Structured output mode - try beta.chat.completions.parse first
|
||||
return await self._call_with_structured_output(
|
||||
client=client,
|
||||
model=final_model,
|
||||
prompt=prompt,
|
||||
response_type=response_type,
|
||||
temperature=temperature,
|
||||
max_tokens=max_tokens,
|
||||
**kwargs
|
||||
)
|
||||
else:
|
||||
# Standard text output mode
|
||||
response = await client.chat.completions.create(
|
||||
model=final_model,
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
temperature=temperature,
|
||||
max_tokens=max_tokens,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
result = response.choices[0].message.content
|
||||
logger.debug(f"LLM response length: {len(result)} chars")
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"LLM call error (model={final_model}, base_url={client.base_url}): {e}")
|
||||
raise
|
||||
|
||||
async def _call_with_structured_output(
    self,
    client: AsyncOpenAI,
    model: str,
    prompt: str,
    response_type: Type[T],
    temperature: float,
    max_tokens: int,
    **kwargs
) -> T:
    """
    Call LLM with structured output support

    Tries OpenAI beta.chat.completions.parse first, falls back to JSON parsing
    if the provider doesn't support structured outputs.

    Args:
        client: OpenAI client
        model: Model name
        prompt: The prompt
        response_type: Pydantic model class
        temperature: Sampling temperature
        max_tokens: Max tokens
        **kwargs: Additional parameters

    Returns:
        Parsed Pydantic model instance

    Raises:
        ValueError: If the response has no content, or the content cannot be
            parsed as ``response_type``.
    """
    messages = [{"role": "user", "content": prompt}]

    # Try OpenAI structured output API first (beta.chat.completions.parse)
    try:
        response = await client.beta.chat.completions.parse(
            model=model,
            messages=messages,
            response_format=response_type,
            temperature=temperature,
            max_tokens=max_tokens,
            **kwargs
        )

        parsed = response.choices[0].message.parsed
        if parsed is not None:
            logger.debug("Structured output parsed successfully via beta API")
            return parsed

        # If parsed is None, fall through to fallback using the raw text
        logger.warning("Structured output API returned None, falling back to JSON parsing")
        content = response.choices[0].message.content

    except Exception as e:
        # If beta API not supported, fall back to JSON mode. Any failure of the
        # beta call lands here, so the plain request below is a second attempt.
        logger.debug(f"Structured output API not available ({e}), falling back to JSON parsing")

        response = await client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            **kwargs
        )
        content = response.choices[0].message.content

    # A message may carry no text at all; fail with a clear error instead of
    # letting json.loads(None) raise a TypeError inside the parser.
    if content is None:
        raise ValueError(
            f"LLM returned no content to parse as {response_type.__name__}"
        )

    # Fallback: Parse JSON from response content
    return self._parse_response_as_model(content, response_type)
|
||||
|
||||
def _parse_response_as_model(self, content: str, response_type: Type[T]) -> T:
    """
    Parse LLM response content as Pydantic model

    Candidates are tried in order until one is valid JSON:
    1. the whole response,
    2. the body of the first markdown code fence (``` or ```json),
    3. the span from the first '{' to the last '}'.

    The first candidate that decodes as JSON is handed to
    ``response_type.model_validate``; validation errors are not caught here.

    Args:
        content: Raw LLM response text
        response_type: Target Pydantic model class

    Returns:
        Parsed model instance

    Raises:
        ValueError: If ``content`` is empty/None or no candidate is valid JSON.
    """
    # Guard first: json.loads(None) raises TypeError, not JSONDecodeError, and
    # slicing None for the error message below would fail as well.
    if not content or not content.strip():
        raise ValueError(
            f"Failed to parse LLM response as {response_type.__name__}: empty response"
        )

    # Direct JSON first
    candidates = [content]

    # Then the body of a markdown code block
    fenced = re.search(r'```(?:json)?\s*([\s\S]+?)\s*```', content)
    if fenced:
        candidates.append(fenced.group(1))

    # Finally any JSON object embedded in surrounding text
    brace_start = content.find('{')
    brace_end = content.rfind('}')
    if brace_start != -1 and brace_end > brace_start:
        candidates.append(content[brace_start:brace_end + 1])

    for candidate in candidates:
        try:
            data = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        return response_type.model_validate(data)

    raise ValueError(f"Failed to parse LLM response as {response_type.__name__}: {content[:200]}...")
|
||||
|
||||
@property
|
||||
def active(self) -> str:
|
||||
|
||||
Reference in New Issue
Block a user