diff --git a/pixelle_video/services/frame_processor.py b/pixelle_video/services/frame_processor.py index 09bd1da..f7b64ae 100644 --- a/pixelle_video/services/frame_processor.py +++ b/pixelle_video/services/frame_processor.py @@ -14,6 +14,10 @@ Frame processor - Process single frame through complete pipeline Orchestrates: TTS → Image Generation → Frame Composition → Video Segment + +Key Feature: +- TTS-driven video duration: Audio duration from TTS is passed to video generation workflows + to ensure perfect sync between audio and video (no padding, no trimming needed) """ from typing import Callable, Optional @@ -193,14 +197,23 @@ class FrameProcessor: logger.debug(f" → Media type: {media_type} (workflow: {workflow_name})") - # Call Media generation (with optional preset) - media_result = await self.core.media( - prompt=frame.image_prompt, - workflow=config.media_workflow, # Pass workflow from config (None = use default) - media_type=media_type, - width=config.media_width, - height=config.media_height - ) + # Build media generation parameters + media_params = { + "prompt": frame.image_prompt, + "workflow": config.media_workflow, # Pass workflow from config (None = use default) + "media_type": media_type, + "width": config.media_width, + "height": config.media_height + } + + # For video workflows: pass audio duration as target video duration + # This ensures video length matches audio length from the source + if is_video_workflow and frame.duration: + media_params["duration"] = frame.duration + logger.info(f" → Generating video with target duration: {frame.duration:.2f}s (from TTS audio)") + + # Call Media generation + media_result = await self.core.media(**media_params) # Store media type frame.media_type = media_result.media_type diff --git a/pixelle_video/services/media.py b/pixelle_video/services/media.py index 75d9e33..d894339 100644 --- a/pixelle_video/services/media.py +++ b/pixelle_video/services/media.py @@ -119,6 +119,7 @@ class MediaService(ComfyBaseService): # Common workflow parameters width: Optional[int] = None, height: Optional[int] = None, + duration: Optional[float] = None, # Video duration in seconds (for video workflows) negative_prompt: Optional[str] = None, steps: Optional[int] = None, seed: Optional[int] = None, @@ -140,6 +141,7 @@ class MediaService(ComfyBaseService): runninghub_api_key: RunningHub API key (optional, overrides config) width: Media width height: Media height + duration: Target video duration in seconds (only for video workflows, typically from TTS audio duration) negative_prompt: Negative prompt steps: Sampling steps seed: Random seed @@ -203,6 +205,10 @@ class MediaService(ComfyBaseService): workflow_params["width"] = width if height is not None: workflow_params["height"] = height + if duration is not None: + workflow_params["duration"] = duration + if media_type == "video": + logger.info(f"📏 Target video duration: {duration:.2f}s (from TTS audio)") if negative_prompt is not None: workflow_params["negative_prompt"] = negative_prompt if steps is not None: