# AI-Video/pixelle_video/services/frame_processor.py

# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Frame processor - Process single frame through complete pipeline
Orchestrates: TTS → Image Generation → Frame Composition → Video Segment
Key Feature:
- TTS-driven video duration: Audio duration from TTS is passed to video generation workflows
to ensure perfect sync between audio and video (no padding, no trimming needed)
"""
import os
from typing import Callable, Optional

import httpx
from loguru import logger

from pixelle_video.models.progress import ProgressEvent
from pixelle_video.models.storyboard import Storyboard, StoryboardFrame, StoryboardConfig
from pixelle_video.services.quality import (
QualityGate,
QualityConfig,
RetryManager,
RetryConfig,
QualityError,
)


class FrameProcessor:
    """Process one storyboard frame end to end: audio → media → composition → video segment."""

def __init__(
self,
pixelle_video_core,
quality_config: Optional[QualityConfig] = None,
retry_config: Optional[RetryConfig] = None,
enable_quality_check: bool = True,
):
"""
Initialize
Args:
pixelle_video_core: PixelleVideoCore instance
quality_config: Quality evaluation configuration
retry_config: Retry behavior configuration
enable_quality_check: Whether to enable quality checking
"""
self.core = pixelle_video_core
self.enable_quality_check = enable_quality_check
        self.quality_gate = QualityGate(
            llm_service=getattr(pixelle_video_core, "llm", None),
            config=quality_config or QualityConfig(),
        )
self.retry_manager = RetryManager(config=retry_config or RetryConfig())

    async def __call__(
        self,
        frame: StoryboardFrame,
        storyboard: Storyboard,
config: StoryboardConfig,
total_frames: int = 1,
progress_callback: Optional[Callable[[ProgressEvent], None]] = None
) -> StoryboardFrame:
"""
Process single frame through complete pipeline
Steps:
1. Generate audio (TTS)
2. Generate image (ComfyKit)
3. Compose frame (add subtitle)
4. Create video segment (image + audio)
Args:
frame: Storyboard frame to process
storyboard: Storyboard instance
config: Storyboard configuration
total_frames: Total number of frames in storyboard
progress_callback: Optional callback for progress updates (receives ProgressEvent)
Returns:
Processed frame with all paths filled
"""
logger.info(f"Processing frame {frame.index}...")
frame_num = frame.index + 1
        # Determine whether this frame needs media generation.
        # If image_path or video_path is already set (e.g. an asset-based
        # pipeline), the frame has existing media and generation can be skipped.
        has_existing_media = frame.image_path is not None or frame.video_path is not None
        needs_generation = frame.image_prompt is not None
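        # Step-2 decision matrix (needs_generation takes precedence):
        #   image_prompt set            -> generate new media, even if a path exists
        #   no prompt, media path set   -> reuse the existing image/video as-is
        #   neither                     -> the template needs no media; clear the fields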
try:
# Step 1: Generate audio (TTS)
if not frame.audio_path:
if progress_callback:
progress_callback(ProgressEvent(
event_type="frame_step",
progress=0.0,
frame_current=frame_num,
frame_total=total_frames,
step=1,
action="audio"
))
await self._step_generate_audio(frame, config)
else:
logger.debug(f" 1/4: Using existing audio: {frame.audio_path}")
# Step 2: Generate media (image or video, conditional)
if needs_generation:
if progress_callback:
progress_callback(ProgressEvent(
event_type="frame_step",
progress=0.25,
frame_current=frame_num,
frame_total=total_frames,
step=2,
action="media"
))
await self._step_generate_media(frame, config)
elif has_existing_media:
# Log appropriate message based on media type
if frame.video_path:
logger.debug(f" 2/4: Using existing video: {frame.video_path}")
else:
logger.debug(f" 2/4: Using existing image: {frame.image_path}")
else:
frame.image_path = None
frame.media_type = None
logger.debug(f" 2/4: Skipped media generation (not required by template)")
# Step 3: Compose frame (add subtitle)
if progress_callback:
progress_callback(ProgressEvent(
event_type="frame_step",
progress=0.50 if (needs_generation or has_existing_media) else 0.33,
frame_current=frame_num,
frame_total=total_frames,
step=3,
action="compose"
))
await self._step_compose_frame(frame, storyboard, config)
# Step 4: Create video segment
if progress_callback:
progress_callback(ProgressEvent(
event_type="frame_step",
progress=0.75 if (needs_generation or has_existing_media) else 0.67,
frame_current=frame_num,
frame_total=total_frames,
step=4,
action="video"
))
await self._step_create_video_segment(frame, config)
logger.info(f"✅ Frame {frame.index} completed")
return frame
except Exception as e:
logger.error(f"❌ Failed to process frame {frame.index}: {e}")
raise

    async def _step_generate_audio(
self,
frame: StoryboardFrame,
config: StoryboardConfig
):
"""Step 1: Generate audio using TTS"""
logger.debug(f" 1/4: Generating audio for frame {frame.index}...")
# Generate output path using task_id
from pixelle_video.utils.os_util import get_task_frame_path
output_path = get_task_frame_path(config.task_id, frame.index, "audio")
# Build TTS params based on inference mode
tts_params = {
"text": frame.narration,
"inference_mode": config.tts_inference_mode,
"output_path": output_path,
"index": frame.index + 1, # 1-based index for workflow
}
if config.tts_inference_mode == "local":
# Local mode: pass voice and speed
if config.voice_id:
tts_params["voice"] = config.voice_id
if config.tts_speed is not None:
tts_params["speed"] = config.tts_speed
else: # comfyui
# ComfyUI mode: pass workflow, voice, speed, and ref_audio
if config.tts_workflow:
tts_params["workflow"] = config.tts_workflow
if config.voice_id:
tts_params["voice"] = config.voice_id
if config.tts_speed is not None:
tts_params["speed"] = config.tts_speed
if config.ref_audio:
tts_params["ref_audio"] = config.ref_audio
audio_path = await self.core.tts(**tts_params)
frame.audio_path = audio_path
# Get audio duration
frame.duration = await self._get_audio_duration(audio_path)
logger.debug(f" ✓ Audio generated: {audio_path} ({frame.duration:.2f}s)")

    async def _step_generate_media(
self,
frame: StoryboardFrame,
config: StoryboardConfig
):
"""
Step 2: Generate media (image or video) using ComfyKit
Enhanced with quality evaluation and retry logic.
"""
logger.debug(f" 2/4: Generating media for frame {frame.index}...")
# Determine media type based on workflow
workflow_name = config.media_workflow or ""
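        # Naming convention: workflows that produce video carry "video_" in
        # their name; anything else is treated as a still-image workflow.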
is_video_workflow = "video_" in workflow_name.lower()
media_type = "video" if is_video_workflow else "image"
logger.debug(f" → Media type: {media_type} (workflow: {workflow_name})")
# Build media generation parameters
media_params = {
"prompt": frame.image_prompt,
"workflow": config.media_workflow,
"media_type": media_type,
"width": config.media_width,
"height": config.media_height,
"index": frame.index + 1,
}
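        # TTS-driven sync (see module docstring): pass the narration's audio
        # duration so the video workflow renders a clip of matching length.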
if is_video_workflow and frame.duration:
media_params["duration"] = frame.duration
logger.info(f" → Generating video with target duration: {frame.duration:.2f}s")
# Define generation operation
async def generate_and_download():
media_result = await self.core.media(**media_params)
local_path = await self._download_media(
media_result.url,
frame.index,
config.task_id,
media_type=media_result.media_type
)
return (media_result, local_path)
# Define quality evaluator
async def evaluate_quality(result):
media_result, local_path = result
if media_result.is_video:
return await self.quality_gate.evaluate_video(
local_path, frame.image_prompt, frame.narration
)
else:
return await self.quality_gate.evaluate_image(
local_path, frame.image_prompt, frame.narration
)
# Execute with retry and quality check
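        # (RetryManager re-runs generate_and_download until evaluate_quality
        # passes or the retry budget is exhausted, then raises QualityError.)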
if self.enable_quality_check:
try:
retry_result = await self.retry_manager.execute_with_retry(
operation=generate_and_download,
quality_evaluator=evaluate_quality,
operation_name=f"frame_{frame.index}_media",
)
media_result, local_path = retry_result.result
# Store quality metrics on frame
if retry_result.quality_score:
frame.quality_score = retry_result.quality_score.overall_score
frame.quality_issues = retry_result.quality_score.issues
frame.retry_count = retry_result.attempts - 1 # first attempt is not a retry
            except QualityError as e:
                logger.warning(f" ⚠ Quality check failed after retries: {e}")
                # Fall back to one final unchecked generation so the pipeline
                # still produces a frame, and record the failure on it
                media_result, local_path = await generate_and_download()
                frame.quality_issues = [str(e)]
else:
# Quality check disabled - just generate
media_result, local_path = await generate_and_download()
# Store results on frame
frame.media_type = media_result.media_type
if media_result.is_image:
frame.image_path = local_path
logger.debug(f" ✓ Image generated: {local_path}")
elif media_result.is_video:
frame.video_path = local_path
if media_result.duration:
frame.duration = media_result.duration
else:
frame.duration = await self._get_video_duration(local_path)
logger.debug(f" ✓ Video generated: {local_path} (duration: {frame.duration:.2f}s)")
else:
raise ValueError(f"Unknown media type: {media_result.media_type}")
# Log quality result
if frame.quality_score is not None:
logger.info(
f" 📊 Quality: {frame.quality_score:.2f} "
f"(retries: {frame.retry_count}, issues: {len(frame.quality_issues or [])})"
)

    async def _step_compose_frame(
        self,
        frame: StoryboardFrame,
        storyboard: Storyboard,
        config: StoryboardConfig
):
"""Step 3: Compose frame with subtitle using HTML template"""
logger.debug(f" 3/4: Composing frame {frame.index}...")
# Generate output path using task_id
from pixelle_video.utils.os_util import get_task_frame_path
output_path = get_task_frame_path(config.task_id, frame.index, "composed")
# For video type: render HTML as transparent overlay image
# For image type: render HTML with image background
# In both cases, we need the composed image
composed_path = await self._compose_frame_html(frame, storyboard, config, output_path)
frame.composed_image_path = composed_path
logger.debug(f" ✓ Frame composed: {composed_path}")

    async def _compose_frame_html(
        self,
        frame: StoryboardFrame,
        storyboard: Storyboard,
        config: StoryboardConfig,
output_path: str
) -> str:
"""Compose frame using HTML template"""
from pixelle_video.services.frame_html import HTMLFrameGenerator
from pixelle_video.utils.template_util import resolve_template_path
# Resolve template path (handles various input formats)
template_path = resolve_template_path(config.frame_template)
# Get content metadata from storyboard
content_metadata = storyboard.content_metadata if storyboard else None
# Build ext data
ext = {
"index": frame.index + 1,
}
# Add custom template parameters
if config.template_params:
ext.update(config.template_params)
# Generate frame using HTML (size is auto-parsed from template path)
generator = HTMLFrameGenerator(template_path)
# Use video_path for video media, image_path for images
media_path = frame.video_path if frame.media_type == "video" else frame.image_path
logger.debug(f"Generating frame with media: '{media_path}' (type: {frame.media_type})")
composed_path = await generator.generate_frame(
title=storyboard.title,
text=frame.narration,
image=media_path, # HTMLFrameGenerator handles both image and video paths
ext=ext,
output_path=output_path
)
return composed_path

    async def _step_create_video_segment(
self,
frame: StoryboardFrame,
config: StoryboardConfig
):
"""Step 4: Create video segment from media + audio"""
logger.debug(f" 4/4: Creating video segment for frame {frame.index}...")
# Generate output path using task_id
from pixelle_video.utils.os_util import get_task_frame_path
output_path = get_task_frame_path(config.task_id, frame.index, "segment")
from pixelle_video.services.video import VideoService
video_service = VideoService()
# Branch based on media type
if frame.media_type == "video":
# Video workflow: overlay HTML template on video, then add audio
logger.debug(f" → Using video-based composition with HTML overlay")
# Step 1: Overlay transparent HTML image on video
# The composed_image_path contains the rendered HTML with transparent background
temp_video_with_overlay = get_task_frame_path(config.task_id, frame.index, "video") + "_overlay.mp4"
video_service.overlay_image_on_video(
video=frame.video_path,
overlay_image=frame.composed_image_path,
output=temp_video_with_overlay,
scale_mode="contain" # Scale video to fit template size (contain mode)
)
# Step 2: Add narration audio to the overlaid video
# Note: The video might have audio (replaced) or be silent (audio added)
segment_path = video_service.merge_audio_video(
video=temp_video_with_overlay,
audio=frame.audio_path,
output=output_path,
replace_audio=True, # Replace video audio with narration
audio_volume=1.0
)
            # Clean up the temporary overlay file
            if os.path.exists(temp_video_with_overlay):
                os.unlink(temp_video_with_overlay)
elif frame.media_type == "image" or frame.media_type is None:
# Image workflow: Use composed image directly
# The asset_default.html template includes the image in the composition
logger.debug(f" → Using image-based composition")
segment_path = video_service.create_video_from_image(
image=frame.composed_image_path,
audio=frame.audio_path,
output=output_path,
fps=config.video_fps
)
else:
raise ValueError(f"Unknown media type: {frame.media_type}")
frame.video_segment_path = segment_path
logger.debug(f" ✓ Video segment created: {segment_path}")

    async def _get_audio_duration(self, audio_path: str) -> float:
"""Get audio duration in seconds"""
try:
# Try using ffmpeg-python
import ffmpeg
probe = ffmpeg.probe(audio_path)
duration = float(probe['format']['duration'])
return duration
        except Exception as e:
            logger.warning(f"Failed to get audio duration: {e}, using estimate")
            # Fallback: estimate from file size (very rough).
            # Assume ~16 kbps MP3, i.e. roughly 2000 bytes per second.
            file_size = os.path.getsize(audio_path)
            estimated_duration = file_size / 2000
            return max(1.0, estimated_duration)  # at least 1 second

    async def _download_media(
self,
url: str,
frame_index: int,
task_id: str,
media_type: str
) -> str:
"""Download media (image or video) from URL to local file"""
from pixelle_video.utils.os_util import get_task_frame_path
output_path = get_task_frame_path(task_id, frame_index, media_type)
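        # Generous read/write timeouts: generated media (especially video) can be large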
timeout = httpx.Timeout(connect=10.0, read=60, write=60, pool=60)
async with httpx.AsyncClient(timeout=timeout) as client:
response = await client.get(url)
response.raise_for_status()
with open(output_path, 'wb') as f:
f.write(response.content)
return output_path

    async def _get_video_duration(self, video_path: str) -> float:
        """Get video duration in seconds"""
        try:
            import ffmpeg
            probe = ffmpeg.probe(video_path)
            return float(probe['format']['duration'])
        except Exception as e:
            logger.warning(f"Failed to get video duration: {e}, defaulting to 1 second")
            # Fallback: duration could not be determined from the file
            return 1.0
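

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only; kept as a comment so importing this module
# stays side-effect free). It assumes a ready PixelleVideoCore instance and a
# parsed Storyboard whose `frames` and `config` attributes expose the fields
# read above; the real model signatures live in pixelle_video.models.storyboard.
#
#   import asyncio
#
#   async def demo(core, storyboard):
#       processor = FrameProcessor(core, enable_quality_check=True)
#       processed = await processor(
#           frame=storyboard.frames[0],          # assumed attribute
#           storyboard=storyboard,
#           config=storyboard.config,            # assumed accessor
#           total_frames=len(storyboard.frames),
#           progress_callback=lambda ev: print(ev.event_type, ev.progress),
#       )
#       print(processed.video_segment_path)
#
#   asyncio.run(demo(core, storyboard))
# ---------------------------------------------------------------------------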