diff --git a/api/routers/image.py b/api/routers/image.py index 92f23b1..f9bd460 100644 --- a/api/routers/image.py +++ b/api/routers/image.py @@ -43,18 +43,27 @@ async def image_generate( try: logger.info(f"Image generation request: {request.prompt[:50]}...") - # Call image service - image_path = await pixelle_video.image( + # Call media service (backward compatible with image API) + media_result = await pixelle_video.media( prompt=request.prompt, width=request.width, height=request.height, workflow=request.workflow ) + # For backward compatibility, only support image results in /image endpoint + if media_result.is_video: + raise HTTPException( + status_code=400, + detail="Video workflow used. Please use /media/generate endpoint for video generation." + ) + return ImageGenerateResponse( - image_path=image_path + image_path=media_result.url ) + except HTTPException: + raise except Exception as e: logger.error(f"Image generation error: {e}") raise HTTPException(status_code=500, detail=str(e)) diff --git a/pixelle_video/models/media.py b/pixelle_video/models/media.py new file mode 100644 index 0000000..2c7eca9 --- /dev/null +++ b/pixelle_video/models/media.py @@ -0,0 +1,61 @@ +# Copyright (C) 2025 AIDC-AI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Media generation result models +""" + +from typing import Literal, Optional +from pydantic import BaseModel, Field + + +class MediaResult(BaseModel): + """ + Media generation result from workflow execution + + Supports both image and video outputs from ComfyUI workflows. + The media_type indicates what kind of media was generated. 
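A consumer-side sketch of how the union type is meant to be handled, mirroring the /image router above (handle_image/handle_video are hypothetical helpers, not part of this PR):

    result = await pixelle_video.media(prompt="a cat")
    if result.is_video:
        handle_video(result.url, result.duration)
    else:
        handle_image(result.url)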
+
+    Attributes:
+        media_type: Type of media generated ("image" or "video")
+        url: URL or path to the generated media
+        duration: Duration in seconds (only for video, None for image)
+
+    Examples:
+        # Image result
+        MediaResult(media_type="image", url="http://example.com/image.png")
+
+        # Video result
+        MediaResult(media_type="video", url="http://example.com/video.mp4", duration=5.2)
+    """
+
+    media_type: Literal["image", "video"] = Field(
+        description="Type of generated media"
+    )
+    url: str = Field(
+        description="URL or path to the generated media file"
+    )
+    duration: Optional[float] = Field(
+        None,
+        description="Duration in seconds (only applicable for video)"
+    )
+
+    @property
+    def is_image(self) -> bool:
+        """Check if this is an image result"""
+        return self.media_type == "image"
+
+    @property
+    def is_video(self) -> bool:
+        """Check if this is a video result"""
+        return self.media_type == "video"
+
diff --git a/pixelle_video/models/storyboard.py b/pixelle_video/models/storyboard.py
index 6ef3f7d..1204991 100644
--- a/pixelle_video/models/storyboard.py
+++ b/pixelle_video/models/storyboard.py
@@ -57,16 +57,18 @@ class StoryboardFrame:
     """Single storyboard frame"""
     index: int  # Frame index (0-based)
     narration: str  # Narration text
-    image_prompt: str  # Image generation prompt
+    image_prompt: Optional[str]  # Image generation prompt (can be None for text-only or video)
 
     # Generated resource paths
-    audio_path: Optional[str] = None  # Audio file path
-    image_path: Optional[str] = None  # Original image path
-    composed_image_path: Optional[str] = None  # Composed image path (with subtitles)
-    video_segment_path: Optional[str] = None  # Video segment path
+    audio_path: Optional[str] = None  # Audio file path (narration)
+    media_type: Optional[str] = None  # Media type: "image" or "video" (None if no media)
+    image_path: Optional[str] = None  # Original image path (for image type)
+    video_path: Optional[str] = None  # Original video path (for video type, before composition)
+    composed_image_path: Optional[str] = None  # Composed image path (with subtitles, for image type)
+    video_segment_path: Optional[str] = None  # Final video segment path
 
     # Metadata
-    duration: float = 0.0  # Audio duration (seconds)
+    duration: float = 0.0  # Frame duration (seconds, from audio or video)
     created_at: Optional[datetime] = None
 
     def __post_init__(self):
diff --git a/pixelle_video/pipelines/base.py b/pixelle_video/pipelines/base.py
index 59493cd..b511e48 100644
--- a/pixelle_video/pipelines/base.py
+++ b/pixelle_video/pipelines/base.py
@@ -63,8 +63,11 @@ class BasePipeline(ABC):
         # Quick access to services (convenience)
         self.llm = pixelle_video_core.llm
         self.tts = pixelle_video_core.tts
-        self.image = pixelle_video_core.image
+        self.media = pixelle_video_core.media
         self.video = pixelle_video_core.video
+
+        # Backward compatibility alias
+        self.image = pixelle_video_core.media
 
     @abstractmethod
     async def __call__(
diff --git a/pixelle_video/pipelines/standard.py b/pixelle_video/pipelines/standard.py
index 44af54c..5864659 100644
--- a/pixelle_video/pipelines/standard.py
+++ b/pixelle_video/pipelines/standard.py
@@ -269,11 +269,13 @@ class StandardPipeline(BasePipeline):
         )
 
         # ========== Step 0.8: Check template requirements ==========
-        template_requires_image = self._check_template_requires_image(config.frame_template)
-        if template_requires_image:
+        template_media_type = self._check_template_media_type(config.frame_template)
+        if template_media_type == "video":
+            logger.info(f"🎬 Template requires video generation")
+        elif template_media_type == "image":
             logger.info(f"📸 Template requires image generation")
-        else:
-            logger.info(f"⚡ Template does not require images - skipping image generation pipeline")
+        else:  # text
+            logger.info(f"⚡ Template does not require media - skipping media generation pipeline")
             logger.info(f"   💡 Benefits: Faster generation + Lower cost + No ComfyUI dependency")
 
         try:
@@ -294,8 +296,61 @@
             logger.info(f"✅ Split script into {len(narrations)} segments (by lines)")
             logger.info(f"   Note: n_scenes={n_scenes} is ignored in fixed mode")
 
-        # ========== Step 2: Generate image prompts (conditional) ==========
-        if template_requires_image:
+        # ========== Step 2: Generate media prompts (conditional) ==========
+        if template_media_type == "video":
+            # Video template: generate video prompts
+            self._report_progress(progress_callback, "generating_video_prompts", 0.15)
+
+            from pixelle_video.utils.content_generators import generate_video_prompts
+
+            # Override prompt_prefix if provided
+            original_prefix = None
+            prefix_overridden = False
+            if prompt_prefix is not None:
+                image_config = self.core.config.get("comfyui", {}).get("image", {})
+                original_prefix = image_config.get("prompt_prefix")
+                image_config["prompt_prefix"] = prompt_prefix
+                prefix_overridden = True
+                logger.info(f"Using custom prompt_prefix: '{prompt_prefix}'")
+
+            try:
+                # Create progress callback wrapper for video prompt generation
+                def video_prompt_progress(completed: int, total: int, message: str):
+                    batch_progress = completed / total if total > 0 else 0
+                    overall_progress = 0.15 + (batch_progress * 0.15)
+                    self._report_progress(
+                        progress_callback,
+                        "generating_video_prompts",
+                        overall_progress,
+                        extra_info=message
+                    )
+
+                # Generate base video prompts
+                base_video_prompts = await generate_video_prompts(
+                    self.llm,
+                    narrations=narrations,
+                    min_words=min_image_prompt_words,
+                    max_words=max_image_prompt_words,
+                    progress_callback=video_prompt_progress
+                )
+
+                # Apply prompt prefix
+                from pixelle_video.utils.prompt_helper import build_image_prompt
+                image_config = self.core.config.get("comfyui", {}).get("image", {})
+                prompt_prefix_to_use = prompt_prefix if prompt_prefix is not None else image_config.get("prompt_prefix", "")
+
+                image_prompts = []
+                for base_prompt in base_video_prompts:
+                    final_prompt = build_image_prompt(base_prompt, prompt_prefix_to_use)
+                    image_prompts.append(final_prompt)
+
+            finally:
+                # Restore original prompt_prefix (also when no prefix existed before the override)
+                if prefix_overridden:
+                    if original_prefix is None:
+                        image_config.pop("prompt_prefix", None)
+                    else:
+                        image_config["prompt_prefix"] = original_prefix
+
+            logger.info(f"✅ Generated {len(image_prompts)} video prompts")
+
+        elif template_media_type == "image":
+            # Image template: generate image prompts
             self._report_progress(progress_callback, "generating_image_prompts", 0.15)
 
             # Override prompt_prefix if provided
@@ -343,12 +398,13 @@
                     image_config["prompt_prefix"] = original_prefix
 
             logger.info(f"✅ Generated {len(image_prompts)} image prompts")
-        else:
-            # Skip image prompt generation
+
+        else:  # text
+            # Text-only template: skip media prompt generation
             image_prompts = [None] * len(narrations)
             self._report_progress(progress_callback, "preparing_frames", 0.15)
-            logger.info(f"⚡ Skipped image prompt generation (template doesn't need images)")
-            logger.info(f"   💡 Savings: {len(narrations)} LLM calls + {len(narrations)} image generations")
+            logger.info(f"⚡ Skipped media prompt generation (text-only template)")
+            logger.info(f"   💡 Savings: {len(narrations)} LLM calls + {len(narrations)} media generations")
 
         # ========== Step 3: Create frames
========== for i, (narration, image_prompt) in enumerate(zip(narrations, image_prompts)): @@ -452,29 +508,44 @@ class StandardPipeline(BasePipeline): logger.error(f"❌ Video generation failed: {e}") raise - def _check_template_requires_image(self, frame_template: str) -> bool: + def _check_template_media_type(self, frame_template: str) -> str: """ - Check if template requires image generation + Check template media type requirement This is checked at pipeline level to avoid unnecessary: - - LLM calls (generating image_prompts) - - Image generation API calls + - LLM calls (generating media prompts) + - Media generation API calls - ComfyUI dependency + Template naming rules: + - video_*.html: Video template (returns "video") + - Other templates with {{image}}: Image template (returns "image") + - Other templates without {{image}}: Text-only template (returns "text") + Args: - frame_template: Template path (e.g., "1080x1920/default.html") + frame_template: Template path (e.g., "1080x1920/default.html" or "1080x1920/video_default.html") Returns: - True if template contains {{image}}, False otherwise + "video", "image", or "text" """ from pixelle_video.services.frame_html import HTMLFrameGenerator from pixelle_video.utils.template_util import resolve_template_path + # Check if template name starts with video_ + template_name = Path(frame_template).name + if template_name.startswith("video_"): + logger.debug(f"Template '{frame_template}' is video template (video_ prefix)") + return "video" + + # Check if template contains {{image}} template_path = resolve_template_path(frame_template) generator = HTMLFrameGenerator(template_path) - requires = generator.requires_image() - logger.debug(f"Template '{frame_template}' requires_image={requires}") - - return requires + requires_image = generator.requires_image() + if requires_image: + logger.debug(f"Template '{frame_template}' is image template (has {{image}})") + return "image" + else: + logger.debug(f"Template '{frame_template}' is text-only template") + return "text" diff --git a/pixelle_video/prompts/video_generation.py b/pixelle_video/prompts/video_generation.py new file mode 100644 index 0000000..f795012 --- /dev/null +++ b/pixelle_video/prompts/video_generation.py @@ -0,0 +1,133 @@ +# Copyright (C) 2025 AIDC-AI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Video prompt generation template + +For generating video prompts from narrations. 
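The intended round trip, as a minimal sketch (assumes an async llm callable matching the LLMService call signature; the real batching, retry, and fence-tolerant JSON parsing live in content_generators.generate_video_prompts):

    import json
    prompt = build_video_prompt_prompt(narrations, min_words=30, max_words=60)
    response = await llm(prompt=prompt, temperature=0.7)
    video_prompts = json.loads(response)["video_prompts"]  # one prompt per narration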
+""" + +import json +from typing import List + + +VIDEO_PROMPT_GENERATION_PROMPT = """# 角色定位 +你是一个专业的视频创意设计师,擅长为视频脚本创作富有动感和表现力的视频生成提示词,将叙述内容转化为生动的视频画面。 + +# 核心任务 +基于已有的视频脚本,为每个分镜的"旁白内容"创作对应的**英文**视频生成提示词,确保视频画面与叙述内容完美配合,通过动态画面增强观众的理解和记忆。 + +**重要:输入包含 {narrations_count} 个旁白,你必须为每个旁白都生成一个对应的视频提示词,总共输出 {narrations_count} 个视频提示词。** + +# 输入内容 +{narrations_json} + +# 输出要求 + +## 视频提示词规范 +- 语言:**必须使用英文**(用于 AI 视频生成模型) +- 描述结构:scene + character action + camera movement + emotion + atmosphere +- 描述长度:确保描述清晰完整且富有创意(建议 50-100 个英文单词) +- 动态元素:强调动作、运动、变化等动态效果 + +## 视觉创意要求 +- 每个视频都要准确反映对应旁白的具体内容和情感 +- 突出画面的动态性:角色动作、物体运动、镜头移动、场景转换等 +- 使用象征手法将抽象概念视觉化(如用流动的水代表时间流逝,用上升的阶梯代表进步等) +- 画面要表现出丰富的情感和动作,增强视觉冲击力 +- 通过镜头语言(推拉摇移)和剪辑节奏增强表现力 + +## 关键英文词汇参考 +- 动作:moving, running, flowing, transforming, growing, falling +- 镜头:camera pan, zoom in, zoom out, tracking shot, aerial view +- 转场:transition, fade in, fade out, dissolve +- 氛围:dynamic, energetic, peaceful, dramatic, mysterious +- 光影:lighting changes, shadows moving, sunlight streaming + +## 视频与文案配合原则 +- 视频要服务于文案,成为文案内容的视觉延伸 +- 避免与文案内容无关或矛盾的视觉元素 +- 选择最能增强文案说服力的动态表现方式 +- 确保观众能通过视频动态快速理解文案的核心观点 + +## 创意指导 +1. **现象描述类文案**:用动态场景表现社会现象的发生过程 +2. **原因分析类文案**:用因果关系的动态演变表现内在逻辑 +3. **影响论证类文案**:用后果场景的动态展开或对比表现影响程度 +4. **深入探讨类文案**:用抽象概念的动态具象化表现深刻思考 +5. **结论启发类文案**:用开放式动态场景或指引性运动表现启发性 + +## 视频特有注意事项 +- 强调动态:每个视频都应该包含明显的动作或运动 +- 镜头语言:适当使用推拉摇移等镜头技巧增强表现力 +- 时长考虑:视频应该是连贯的动态过程,不是静态画面 +- 流畅性:注意动作的流畅性和自然性 + +# 输出格式 +严格按照以下JSON格式输出,**视频提示词必须是英文**: + +```json +{{ + "video_prompts": [ + "[detailed English video prompt with dynamic elements and camera movements]", + "[detailed English video prompt with dynamic elements and camera movements]" + ] +}} +``` + +# 重要提醒 +1. 只输出JSON格式内容,不要添加任何解释说明 +2. 确保JSON格式严格正确,可以被程序直接解析 +3. 输入是 {{"narrations": [旁白数组]}} 格式,输出是 {{"video_prompts": [视频提示词数组]}} 格式 +4. **输出的video_prompts数组必须恰好包含 {narrations_count} 个元素,与输入的narrations数组一一对应** +5. **视频提示词必须使用英文**(for AI video generation models) +6. 视频提示词必须准确反映对应旁白的具体内容和情感 +7. 每个视频都要强调动态性和运动感,避免静态描述 +8. 适当使用镜头语言增强表现力 +9. 
确保视频画面能增强文案的说服力和观众的理解度 + +现在,请为上述 {narrations_count} 个旁白创作对应的 {narrations_count} 个**英文**视频提示词。只输出JSON,不要其他内容。 +""" + + +def build_video_prompt_prompt( + narrations: List[str], + min_words: int, + max_words: int +) -> str: + """ + Build video prompt generation prompt + + Args: + narrations: List of narrations + min_words: Minimum word count + max_words: Maximum word count + + Returns: + Formatted prompt for LLM + + Example: + >>> build_video_prompt_prompt(narrations, 50, 100) + """ + narrations_json = json.dumps( + {"narrations": narrations}, + ensure_ascii=False, + indent=2 + ) + + return VIDEO_PROMPT_GENERATION_PROMPT.format( + narrations_json=narrations_json, + narrations_count=len(narrations), + min_words=min_words, + max_words=max_words + ) + diff --git a/pixelle_video/service.py b/pixelle_video/service.py index 5cdae0e..d2ca202 100644 --- a/pixelle_video/service.py +++ b/pixelle_video/service.py @@ -23,7 +23,7 @@ from loguru import logger from pixelle_video.config import config_manager from pixelle_video.services.llm_service import LLMService from pixelle_video.services.tts_service import TTSService -from pixelle_video.services.image import ImageService +from pixelle_video.services.media import MediaService from pixelle_video.services.video import VideoService from pixelle_video.services.frame_processor import FrameProcessor from pixelle_video.pipelines.standard import StandardPipeline @@ -45,7 +45,7 @@ class PixelleVideoCore: # Use capabilities directly answer = await pixelle_video.llm("Explain atomic habits") audio = await pixelle_video.tts("Hello world") - image = await pixelle_video.image(prompt="a cat") + media = await pixelle_video.media(prompt="a cat") # Check active capabilities print(f"Using LLM: {pixelle_video.llm.active}") @@ -56,7 +56,7 @@ class PixelleVideoCore: ├── config (configuration) ├── llm (LLM service - direct OpenAI SDK) ├── tts (TTS service - ComfyKit workflows) - ├── image (Image service - ComfyKit workflows) + ├── media (Media service - ComfyKit workflows, supports image & video) └── pipelines (video generation pipelines) ├── standard (standard workflow) ├── custom (custom workflow template) @@ -77,7 +77,7 @@ class PixelleVideoCore: # Core services (initialized in initialize()) self.llm: Optional[LLMService] = None self.tts: Optional[TTSService] = None - self.image: Optional[ImageService] = None + self.media: Optional[MediaService] = None self.video: Optional[VideoService] = None self.frame_processor: Optional[FrameProcessor] = None @@ -105,7 +105,7 @@ class PixelleVideoCore: # 1. Initialize core services self.llm = LLMService(self.config) self.tts = TTSService(self.config) - self.image = ImageService(self.config) + self.media = MediaService(self.config) self.video = VideoService() self.frame_processor = FrameProcessor(self) diff --git a/pixelle_video/services/__init__.py b/pixelle_video/services/__init__.py index fd4d282..77979c5 100644 --- a/pixelle_video/services/__init__.py +++ b/pixelle_video/services/__init__.py @@ -18,7 +18,7 @@ Core services providing atomic capabilities. 
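Note the compatibility nuance with the ImageService alias introduced below: old imports keep resolving, but the call now returns a MediaResult instead of a bare URL string. A sketch of what a migrating caller sees:

    from pixelle_video.services import ImageService  # resolves to MediaService
    svc = ImageService(config)
    result = await svc(prompt="a cat")  # MediaResult, no longer a str
    url = result.url                    # the old string value lives here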
Services: - LLMService: LLM text generation - TTSService: Text-to-speech -- ImageService: Image generation +- MediaService: Media generation (image & video) - VideoService: Video processing - FrameProcessor: Frame processing orchestrator - ComfyBaseService: Base class for ComfyUI-based services @@ -27,15 +27,19 @@ Services: from pixelle_video.services.comfy_base_service import ComfyBaseService from pixelle_video.services.llm_service import LLMService from pixelle_video.services.tts_service import TTSService -from pixelle_video.services.image import ImageService +from pixelle_video.services.media import MediaService from pixelle_video.services.video import VideoService from pixelle_video.services.frame_processor import FrameProcessor +# Backward compatibility alias +ImageService = MediaService + __all__ = [ "ComfyBaseService", "LLMService", "TTSService", - "ImageService", + "MediaService", + "ImageService", # Backward compatibility "VideoService", "FrameProcessor", ] diff --git a/pixelle_video/services/frame_processor.py b/pixelle_video/services/frame_processor.py index 095eaf1..4767663 100644 --- a/pixelle_video/services/frame_processor.py +++ b/pixelle_video/services/frame_processor.py @@ -84,7 +84,7 @@ class FrameProcessor: )) await self._step_generate_audio(frame, config) - # Step 2: Generate image (conditional) + # Step 2: Generate media (image or video, conditional) if needs_image: if progress_callback: progress_callback(ProgressEvent( @@ -93,12 +93,13 @@ class FrameProcessor: frame_current=frame_num, frame_total=total_frames, step=2, - action="image" + action="media" )) - await self._step_generate_image(frame, config) + await self._step_generate_media(frame, config) else: frame.image_path = None - logger.debug(f" 2/4: Skipped image generation (not required by template)") + frame.media_type = None + logger.debug(f" 2/4: Skipped media generation (not required by template)") # Step 3: Compose frame (add subtitle) if progress_callback: @@ -176,27 +177,66 @@ class FrameProcessor: logger.debug(f" ✓ Audio generated: {audio_path} ({frame.duration:.2f}s)") - async def _step_generate_image( + async def _step_generate_media( self, frame: StoryboardFrame, config: StoryboardConfig ): - """Step 2: Generate image using ComfyKit""" - logger.debug(f" 2/4: Generating image for frame {frame.index}...") + """Step 2: Generate media (image or video) using ComfyKit""" + logger.debug(f" 2/4: Generating media for frame {frame.index}...") - # Call Image generation (with optional preset) - image_url = await self.core.image( + # Determine media type based on workflow + # video_ prefix in workflow name indicates video generation + workflow_name = config.image_workflow or "" + is_video_workflow = "video_" in workflow_name.lower() + media_type = "video" if is_video_workflow else "image" + + logger.debug(f" → Media type: {media_type} (workflow: {workflow_name})") + + # Call Media generation (with optional preset) + media_result = await self.core.media( prompt=frame.image_prompt, workflow=config.image_workflow, # Pass workflow from config (None = use default) + media_type=media_type, width=config.image_width, height=config.image_height ) - # Download image to local (pass task_id) - local_path = await self._download_image(image_url, frame.index, config.task_id) - frame.image_path = local_path + # Store media type + frame.media_type = media_result.media_type - logger.debug(f" ✓ Image generated: {local_path}") + if media_result.is_image: + # Download image to local (pass task_id) + local_path = await 
self._download_media( + media_result.url, + frame.index, + config.task_id, + media_type="image" + ) + frame.image_path = local_path + logger.debug(f" ✓ Image generated: {local_path}") + + elif media_result.is_video: + # Download video to local (pass task_id) + local_path = await self._download_media( + media_result.url, + frame.index, + config.task_id, + media_type="video" + ) + frame.video_path = local_path + + # Update duration from video if available + if media_result.duration: + frame.duration = media_result.duration + logger.debug(f" ✓ Video generated: {local_path} (duration: {frame.duration:.2f}s)") + else: + # Get video duration from file + frame.duration = await self._get_video_duration(local_path) + logger.debug(f" ✓ Video generated: {local_path} (duration: {frame.duration:.2f}s)") + + else: + raise ValueError(f"Unknown media type: {media_result.media_type}") async def _step_compose_frame( self, @@ -211,7 +251,9 @@ class FrameProcessor: from pixelle_video.utils.os_util import get_task_frame_path output_path = get_task_frame_path(config.task_id, frame.index, "composed") - # Use HTML template to compose frame + # For video type: render HTML as transparent overlay image + # For image type: render HTML with image background + # In both cases, we need the composed image composed_path = await self._compose_frame_html(frame, storyboard, config, output_path) frame.composed_image_path = composed_path @@ -264,23 +306,60 @@ class FrameProcessor: frame: StoryboardFrame, config: StoryboardConfig ): - """Step 4: Create video segment from image + audio""" + """Step 4: Create video segment from media + audio""" logger.debug(f" 4/4: Creating video segment for frame {frame.index}...") # Generate output path using task_id from pixelle_video.utils.os_util import get_task_frame_path output_path = get_task_frame_path(config.task_id, frame.index, "segment") - # Call video compositor to create video from image + audio from pixelle_video.services.video import VideoService video_service = VideoService() - segment_path = video_service.create_video_from_image( - image=frame.composed_image_path, - audio=frame.audio_path, - output=output_path, - fps=config.video_fps - ) + # Branch based on media type + if frame.media_type == "video": + # Video workflow: overlay HTML template on video, then add audio + logger.debug(f" → Using video-based composition with HTML overlay") + + # Step 1: Overlay transparent HTML image on video + # The composed_image_path contains the rendered HTML with transparent background + temp_video_with_overlay = get_task_frame_path(config.task_id, frame.index, "video") + "_overlay.mp4" + + video_service.overlay_image_on_video( + video=frame.video_path, + overlay_image=frame.composed_image_path, + output=temp_video_with_overlay, + scale_mode="contain" # Scale video to fit template size (contain mode) + ) + + # Step 2: Add narration audio to the overlaid video + # Note: The video might have audio (replaced) or be silent (audio added) + segment_path = video_service.merge_audio_video( + video=temp_video_with_overlay, + audio=frame.audio_path, + output=output_path, + replace_audio=True, # Replace video audio with narration + audio_volume=1.0 + ) + + # Clean up temp file + import os + if os.path.exists(temp_video_with_overlay): + os.unlink(temp_video_with_overlay) + + elif frame.media_type == "image" or frame.media_type is None: + # Image workflow: create video from image + audio + logger.debug(f" → Using image-based composition") + + segment_path = video_service.create_video_from_image( + 
image=frame.composed_image_path, + audio=frame.audio_path, + output=output_path, + fps=config.video_fps + ) + + else: + raise ValueError(f"Unknown media type: {frame.media_type}") frame.video_segment_path = segment_path @@ -303,10 +382,16 @@ class FrameProcessor: estimated_duration = file_size / 2000 return max(1.0, estimated_duration) # At least 1 second - async def _download_image(self, url: str, frame_index: int, task_id: str) -> str: - """Download image from URL to local file""" + async def _download_media( + self, + url: str, + frame_index: int, + task_id: str, + media_type: str + ) -> str: + """Download media (image or video) from URL to local file""" from pixelle_video.utils.os_util import get_task_frame_path - output_path = get_task_frame_path(task_id, frame_index, "image") + output_path = get_task_frame_path(task_id, frame_index, media_type) async with httpx.AsyncClient() as client: response = await client.get(url) @@ -316,4 +401,16 @@ class FrameProcessor: f.write(response.content) return output_path + + async def _get_video_duration(self, video_path: str) -> float: + """Get video duration in seconds""" + try: + import ffmpeg + probe = ffmpeg.probe(video_path) + duration = float(probe['format']['duration']) + return duration + except Exception as e: + logger.warning(f"Failed to get video duration: {e}, using audio duration") + # Fallback: use audio duration if available + return 1.0 # Default to 1 second if unable to determine diff --git a/pixelle_video/services/image.py b/pixelle_video/services/image.py deleted file mode 100644 index 83c2c72..0000000 --- a/pixelle_video/services/image.py +++ /dev/null @@ -1,192 +0,0 @@ -# Copyright (C) 2025 AIDC-AI -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Image Generation Service - ComfyUI Workflow-based implementation -""" - -from typing import Optional - -from comfykit import ComfyKit -from loguru import logger - -from pixelle_video.services.comfy_base_service import ComfyBaseService - - -class ImageService(ComfyBaseService): - """ - Image generation service - Workflow-based - - Uses ComfyKit to execute image generation workflows. 
- - Usage: - # Use default workflow (workflows/image_flux.json) - image_url = await pixelle_video.image(prompt="a cat") - - # Use specific workflow - image_url = await pixelle_video.image( - prompt="a cat", - workflow="image_flux.json" - ) - - # List available workflows - workflows = pixelle_video.image.list_workflows() - """ - - WORKFLOW_PREFIX = "image_" - DEFAULT_WORKFLOW = None # No hardcoded default, must be configured - WORKFLOWS_DIR = "workflows" - - def __init__(self, config: dict): - """ - Initialize image service - - Args: - config: Full application config dict - """ - super().__init__(config, service_name="image") - - async def __call__( - self, - prompt: str, - workflow: Optional[str] = None, - # ComfyUI connection (optional overrides) - comfyui_url: Optional[str] = None, - runninghub_api_key: Optional[str] = None, - # Common workflow parameters - width: Optional[int] = None, - height: Optional[int] = None, - negative_prompt: Optional[str] = None, - steps: Optional[int] = None, - seed: Optional[int] = None, - cfg: Optional[float] = None, - sampler: Optional[str] = None, - **params - ) -> str: - """ - Generate image using workflow - - Args: - prompt: Image generation prompt - workflow: Workflow filename (default: from config or "image_flux.json") - comfyui_url: ComfyUI URL (optional, overrides config) - runninghub_api_key: RunningHub API key (optional, overrides config) - width: Image width - height: Image height - negative_prompt: Negative prompt - steps: Sampling steps - seed: Random seed - cfg: CFG scale - sampler: Sampler name - **params: Additional workflow parameters - - Returns: - Generated image URL/path - - Examples: - # Simplest: use default workflow (workflows/image_flux.json) - image_url = await pixelle_video.image(prompt="a beautiful cat") - - # Use specific workflow - image_url = await pixelle_video.image( - prompt="a cat", - workflow="image_flux.json" - ) - - # With additional parameters - image_url = await pixelle_video.image( - prompt="a cat", - workflow="image_flux.json", - width=1024, - height=1024, - steps=20, - seed=42 - ) - - # With absolute path - image_url = await pixelle_video.image( - prompt="a cat", - workflow="/path/to/custom.json" - ) - - # With custom ComfyUI server - image_url = await pixelle_video.image( - prompt="a cat", - comfyui_url="http://192.168.1.100:8188" - ) - """ - # 1. Resolve workflow (returns structured info) - workflow_info = self._resolve_workflow(workflow=workflow) - - # 2. Prepare ComfyKit config (supports both selfhost and runninghub) - kit_config = self._prepare_comfykit_config( - comfyui_url=comfyui_url, - runninghub_api_key=runninghub_api_key - ) - - # 3. Build workflow parameters - workflow_params = {"prompt": prompt} - - # Add optional parameters - if width is not None: - workflow_params["width"] = width - if height is not None: - workflow_params["height"] = height - if negative_prompt is not None: - workflow_params["negative_prompt"] = negative_prompt - if steps is not None: - workflow_params["steps"] = steps - if seed is not None: - workflow_params["seed"] = seed - if cfg is not None: - workflow_params["cfg"] = cfg - if sampler is not None: - workflow_params["sampler"] = sampler - - # Add any additional parameters - workflow_params.update(params) - - logger.debug(f"Workflow parameters: {workflow_params}") - - # 4. 
Execute workflow (ComfyKit auto-detects based on input type) - try: - kit = ComfyKit(**kit_config) - - # Determine what to pass to ComfyKit based on source - if workflow_info["source"] == "runninghub" and "workflow_id" in workflow_info: - # RunningHub: pass workflow_id (ComfyKit will use runninghub backend) - workflow_input = workflow_info["workflow_id"] - logger.info(f"Executing RunningHub workflow: {workflow_input}") - else: - # Selfhost: pass file path (ComfyKit will use local ComfyUI) - workflow_input = workflow_info["path"] - logger.info(f"Executing selfhost workflow: {workflow_input}") - - result = await kit.execute(workflow_input, workflow_params) - - # 5. Handle result - if result.status != "completed": - error_msg = result.msg or "Unknown error" - logger.error(f"Image generation failed: {error_msg}") - raise Exception(f"Image generation failed: {error_msg}") - - if not result.images: - logger.error("No images generated") - raise Exception("No images generated") - - image_url = result.images[0] - logger.info(f"✅ Generated image: {image_url}") - return image_url - - except Exception as e: - logger.error(f"Image generation error: {e}") - raise diff --git a/pixelle_video/services/media.py b/pixelle_video/services/media.py new file mode 100644 index 0000000..c915df1 --- /dev/null +++ b/pixelle_video/services/media.py @@ -0,0 +1,285 @@ +# Copyright (C) 2025 AIDC-AI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Media Generation Service - ComfyUI Workflow-based implementation + +Supports both image and video generation workflows. +Automatically detects output type based on ExecuteResult. +""" + +from typing import Optional + +from comfykit import ComfyKit +from loguru import logger + +from pixelle_video.services.comfy_base_service import ComfyBaseService +from pixelle_video.models.media import MediaResult + + +class MediaService(ComfyBaseService): + """ + Media generation service - Workflow-based + + Uses ComfyKit to execute image/video generation workflows. + Supports both image_ and video_ workflow prefixes. 
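Downstream code can split the combined listing by the filename prefix; a sketch of the filtering (mirrors what web/app.py does when a template is video- or image-typed):

    workflows = pixelle_video.media.list_workflows()
    video_only = [w for w in workflows if "video_" in w["key"].lower()]
    image_only = [w for w in workflows if "video_" not in w["key"].lower()]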
+ + Usage: + # Use default workflow (workflows/image_flux.json) + media = await pixelle_video.media(prompt="a cat") + if media.is_image: + print(f"Generated image: {media.url}") + elif media.is_video: + print(f"Generated video: {media.url} ({media.duration}s)") + + # Use specific workflow + media = await pixelle_video.media( + prompt="a cat", + workflow="image_flux.json" + ) + + # List available workflows + workflows = pixelle_video.media.list_workflows() + """ + + WORKFLOW_PREFIX = "" # Will be overridden by _scan_workflows + DEFAULT_WORKFLOW = None # No hardcoded default, must be configured + WORKFLOWS_DIR = "workflows" + + def __init__(self, config: dict): + """ + Initialize media service + + Args: + config: Full application config dict + """ + super().__init__(config, service_name="image") # Keep "image" for config compatibility + + def _scan_workflows(self): + """ + Scan workflows for both image_ and video_ prefixes + + Override parent method to support multiple prefixes + """ + from pixelle_video.utils.os_util import list_resource_dirs, list_resource_files, get_resource_path + from pathlib import Path + + workflows = [] + + # Get all workflow source directories + source_dirs = list_resource_dirs("workflows") + + if not source_dirs: + logger.warning("No workflow source directories found") + return workflows + + # Scan each source directory for workflow files + for source_name in source_dirs: + # Get all JSON files for this source + workflow_files = list_resource_files("workflows", source_name) + + # Filter to only files matching image_ or video_ prefix + matching_files = [ + f for f in workflow_files + if (f.startswith("image_") or f.startswith("video_")) and f.endswith('.json') + ] + + for filename in matching_files: + try: + # Get actual file path + file_path = Path(get_resource_path("workflows", source_name, filename)) + workflow_info = self._parse_workflow_file(file_path, source_name) + workflows.append(workflow_info) + logger.debug(f"Found workflow: {workflow_info['key']}") + except Exception as e: + logger.error(f"Failed to parse workflow {source_name}/{filename}: {e}") + + # Sort by key (source/name) + return sorted(workflows, key=lambda w: w["key"]) + + async def __call__( + self, + prompt: str, + workflow: Optional[str] = None, + # Media type specification (required for proper handling) + media_type: str = "image", # "image" or "video" + # ComfyUI connection (optional overrides) + comfyui_url: Optional[str] = None, + runninghub_api_key: Optional[str] = None, + # Common workflow parameters + width: Optional[int] = None, + height: Optional[int] = None, + negative_prompt: Optional[str] = None, + steps: Optional[int] = None, + seed: Optional[int] = None, + cfg: Optional[float] = None, + sampler: Optional[str] = None, + **params + ) -> MediaResult: + """ + Generate media (image or video) using workflow + + Media type must be specified explicitly via media_type parameter. + Returns a MediaResult object containing media type and URL. 
+
+        Args:
+            prompt: Media generation prompt
+            workflow: Workflow filename (default: from config; no hardcoded fallback)
+            media_type: Type of media to generate - "image" or "video" (default: "image")
+            comfyui_url: ComfyUI URL (optional, overrides config)
+            runninghub_api_key: RunningHub API key (optional, overrides config)
+            width: Media width
+            height: Media height
+            negative_prompt: Negative prompt
+            steps: Sampling steps
+            seed: Random seed
+            cfg: CFG scale
+            sampler: Sampler name
+            **params: Additional workflow parameters
+
+        Returns:
+            MediaResult object with media_type ("image" or "video") and url
+
+        Examples:
+            # Simplest: use the configured default workflow
+            media = await pixelle_video.media(prompt="a beautiful cat")
+            if media.is_image:
+                print(f"Image: {media.url}")
+
+            # Use specific workflow
+            media = await pixelle_video.media(
+                prompt="a cat",
+                workflow="image_flux.json"
+            )
+
+            # Video workflow (media_type must be passed explicitly)
+            media = await pixelle_video.media(
+                prompt="a cat running",
+                workflow="video_default.json",
+                media_type="video"
+            )
+            if media.is_video:
+                print(f"Video: {media.url}, duration: {media.duration}s")
+
+            # With additional parameters
+            media = await pixelle_video.media(
+                prompt="a cat",
+                workflow="image_flux.json",
+                width=1024,
+                height=1024,
+                steps=20,
+                seed=42
+            )
+
+            # With absolute path
+            media = await pixelle_video.media(
+                prompt="a cat",
+                workflow="/path/to/custom.json"
+            )
+
+            # With custom ComfyUI server
+            media = await pixelle_video.media(
+                prompt="a cat",
+                comfyui_url="http://192.168.1.100:8188"
+            )
+        """
+        # 1. Resolve workflow (returns structured info)
+        workflow_info = self._resolve_workflow(workflow=workflow)
+
+        # 2. Prepare ComfyKit config (supports both selfhost and runninghub)
+        kit_config = self._prepare_comfykit_config(
+            comfyui_url=comfyui_url,
+            runninghub_api_key=runninghub_api_key
+        )
+
+        # 3. Build workflow parameters
+        workflow_params = {"prompt": prompt}
+
+        # Add optional parameters
+        if width is not None:
+            workflow_params["width"] = width
+        if height is not None:
+            workflow_params["height"] = height
+        if negative_prompt is not None:
+            workflow_params["negative_prompt"] = negative_prompt
+        if steps is not None:
+            workflow_params["steps"] = steps
+        if seed is not None:
+            workflow_params["seed"] = seed
+        if cfg is not None:
+            workflow_params["cfg"] = cfg
+        if sampler is not None:
+            workflow_params["sampler"] = sampler
+
+        # Add any additional parameters
+        workflow_params.update(params)
+
+        logger.debug(f"Workflow parameters: {workflow_params}")
+
+        # 4. Execute workflow (ComfyKit auto-detects based on input type)
+        try:
+            kit = ComfyKit(**kit_config)
+
+            # Determine what to pass to ComfyKit based on source
+            if workflow_info["source"] == "runninghub" and "workflow_id" in workflow_info:
+                # RunningHub: pass workflow_id (ComfyKit will use runninghub backend)
+                workflow_input = workflow_info["workflow_id"]
+                logger.info(f"Executing RunningHub workflow: {workflow_input}")
+            else:
+                # Selfhost: pass file path (ComfyKit will use local ComfyUI)
+                workflow_input = workflow_info["path"]
+                logger.info(f"Executing selfhost workflow: {workflow_input}")
+
+            result = await kit.execute(workflow_input, workflow_params)
+
+            # 5.
Handle result based on specified media_type + if result.status != "completed": + error_msg = result.msg or "Unknown error" + logger.error(f"Media generation failed: {error_msg}") + raise Exception(f"Media generation failed: {error_msg}") + + # Extract media based on specified type + if media_type == "video": + # Video workflow - get video from result + if not result.videos: + logger.error("No video generated (workflow returned no videos)") + raise Exception("No video generated") + + video_url = result.videos[0] + logger.info(f"✅ Generated video: {video_url}") + + # Try to extract duration from result (if available) + duration = None + if hasattr(result, 'duration') and result.duration: + duration = result.duration + + return MediaResult( + media_type="video", + url=video_url, + duration=duration + ) + else: # image + # Image workflow - get image from result + if not result.images: + logger.error("No image generated (workflow returned no images)") + raise Exception("No image generated") + + image_url = result.images[0] + logger.info(f"✅ Generated image: {image_url}") + + return MediaResult( + media_type="image", + url=image_url + ) + + except Exception as e: + logger.error(f"Media generation error: {e}") + raise diff --git a/pixelle_video/services/video.py b/pixelle_video/services/video.py index d9e8a8b..35e7a56 100644 --- a/pixelle_video/services/video.py +++ b/pixelle_video/services/video.py @@ -239,6 +239,51 @@ class VideoService: logger.error(f"FFmpeg concat filter error: {error_msg}") raise RuntimeError(f"Failed to concatenate videos: {error_msg}") + def _get_video_duration(self, video: str) -> float: + """Get video duration in seconds""" + try: + probe = ffmpeg.probe(video) + duration = float(probe['format']['duration']) + return duration + except Exception as e: + logger.warning(f"Failed to get video duration: {e}") + return 0.0 + + def _get_audio_duration(self, audio: str) -> float: + """Get audio duration in seconds""" + try: + probe = ffmpeg.probe(audio) + duration = float(probe['format']['duration']) + return duration + except Exception as e: + logger.warning(f"Failed to get audio duration: {e}, using estimate") + # Fallback: estimate based on file size (very rough) + import os + file_size = os.path.getsize(audio) + # Assume ~16kbps for MP3, so 2KB per second + estimated_duration = file_size / 2000 + return max(1.0, estimated_duration) # At least 1 second + + def has_audio_stream(self, video: str) -> bool: + """ + Check if video has audio stream + + Args: + video: Video file path + + Returns: + True if video has audio stream, False otherwise + """ + try: + probe = ffmpeg.probe(video) + audio_streams = [s for s in probe.get('streams', []) if s['codec_type'] == 'audio'] + has_audio = len(audio_streams) > 0 + logger.debug(f"Video {video} has_audio={has_audio}") + return has_audio + except Exception as e: + logger.warning(f"Failed to probe video audio streams: {e}, assuming no audio") + return False + def merge_audio_video( self, video: str, @@ -247,9 +292,18 @@ class VideoService: replace_audio: bool = True, audio_volume: float = 1.0, video_volume: float = 0.0, + pad_strategy: str = "freeze", # "freeze" (freeze last frame) or "black" (black screen) ) -> str: """ - Merge audio with video + Merge audio with video, using the longer duration + + The output video duration will be the maximum of video and audio duration. + If audio is longer than video, the video will be padded using the specified strategy. + + Automatically handles videos with or without audio streams. 
+        - If video has no audio: adds the audio track
+        - If video has audio and replace_audio=True: replaces with new audio
+        - If video has audio and replace_audio=False: mixes both audio tracks
 
         Args:
             video: Video file path
@@ -259,6 +313,9 @@
             audio_volume: Volume of the new audio (0.0 to 1.0+)
             video_volume: Volume of original video audio (0.0 to 1.0+)
                 Only used when replace_audio=False
+            pad_strategy: Strategy to pad video if audio is longer
+                - "freeze": Freeze last frame (default)
+                - "black": Fill with black screen
 
         Returns:
             Path to the output video file
@@ -267,28 +324,110 @@
             RuntimeError: If FFmpeg execution fails
 
         Note:
-            - When replace_audio=True, video's original audio is removed
-            - When replace_audio=False, original and new audio are mixed
-            - Audio is trimmed/extended to match video duration
+            - Uses the longer duration between video and audio
+            - When audio is longer, video is padded using pad_strategy
+            - When video is longer, the audio simply ends early (output keeps the video length)
+            - Automatically detects if video has audio
+            - When video is silent, audio is added regardless of replace_audio
+            - When replace_audio=True and video has audio, original audio is removed
+            - When replace_audio=False and video has audio, original and new audio are mixed
+        """
+        # Get durations of video and audio
+        video_duration = self._get_video_duration(video)
+        audio_duration = self._get_audio_duration(audio)
+
+        logger.info(f"Video duration: {video_duration:.2f}s, Audio duration: {audio_duration:.2f}s")
+
+        # Determine target duration (max of both)
+        target_duration = max(video_duration, audio_duration)
+        logger.info(f"Target output duration: {target_duration:.2f}s")
+
+        # Check if video has audio stream
+        video_has_audio = self.has_audio_stream(video)
+
+        # Prepare video stream (potentially with padding)
+        input_video = ffmpeg.input(video)
+        video_stream = input_video.video
+
+        # Pad video if audio is longer
+        if audio_duration > video_duration:
+            pad_duration = audio_duration - video_duration
+            logger.info(f"Audio is longer, padding video by {pad_duration:.2f}s using '{pad_strategy}' strategy")
+
+            if pad_strategy == "freeze":
+                # Freeze last frame: tpad filter
+                video_stream = video_stream.filter('tpad', stop_mode='clone', stop_duration=pad_duration)
+            else:  # black
+                # Generate black frames for the padding duration via lavfi
+                # Get video properties
+                probe = ffmpeg.probe(video)
+                video_info = next(s for s in probe['streams'] if s['codec_type'] == 'video')
+                width = int(video_info['width'])
+                height = int(video_info['height'])
+                fps_str = video_info['r_frame_rate']
+                fps_num, fps_den = map(int, fps_str.split('/'))
+                fps = fps_num / fps_den if fps_den != 0 else 30
+
+                black_input = ffmpeg.input(
+                    f'color=c=black:s={width}x{height}:r={fps}',
+                    f='lavfi',
+                    t=pad_duration
+                )
+
+                # Concatenate original video with black padding
+                video_stream = ffmpeg.concat(video_stream, black_input.video, v=1, a=0)
+
+        # Prepare audio stream
+        input_audio = ffmpeg.input(audio)
+        audio_stream = input_audio.audio.filter('volume', audio_volume)
+
+        if not video_has_audio:
+            logger.info(f"Video has no audio stream, adding audio track")
+            # Video is silent, just add the audio
+            try:
+                (
+                    ffmpeg
+                    .output(
+                        video_stream,
+                        audio_stream,
+                        output,
+                        vcodec='libx264',  # Re-encode video if padded
+                        acodec='aac',
+                        audio_bitrate='192k',
t=target_duration # Trim to target duration + ) + .overwrite_output() + .run(capture_stdout=True, capture_stderr=True) + ) + + logger.success(f"Audio added to silent video: {output}") + return output + except ffmpeg.Error as e: + error_msg = e.stderr.decode() if e.stderr else str(e) + logger.error(f"FFmpeg error adding audio to silent video: {error_msg}") + raise RuntimeError(f"Failed to add audio to video: {error_msg}") + + # Video has audio, proceed with merging logger.info(f"Merging audio with video (replace={replace_audio})") try: - input_video = ffmpeg.input(video) - input_audio = ffmpeg.input(audio) - if replace_audio: # Replace audio: use only new audio, ignore original ( ffmpeg .output( - input_video.video, - input_audio.audio.filter('volume', audio_volume), + video_stream, + audio_stream, output, - vcodec='copy', + vcodec='libx264', # Re-encode video if padded acodec='aac', audio_bitrate='192k', - shortest=None + t=target_duration # Trim to target duration ) .overwrite_output() .run(capture_stdout=True, capture_stderr=True) @@ -298,22 +437,23 @@ class VideoService: mixed_audio = ffmpeg.filter( [ input_video.audio.filter('volume', video_volume), - input_audio.audio.filter('volume', audio_volume) + audio_stream ], 'amix', inputs=2, - duration='first' + duration='longest' # Use longest audio ) ( ffmpeg .output( - input_video.video, + video_stream, mixed_audio, output, - vcodec='copy', + vcodec='libx264', # Re-encode video if padded acodec='aac', - audio_bitrate='192k' + audio_bitrate='192k', + t=target_duration # Trim to target duration ) .overwrite_output() .run(capture_stdout=True, capture_stderr=True) @@ -326,6 +466,92 @@ class VideoService: logger.error(f"FFmpeg merge error: {error_msg}") raise RuntimeError(f"Failed to merge audio and video: {error_msg}") + def overlay_image_on_video( + self, + video: str, + overlay_image: str, + output: str, + scale_mode: str = "contain" + ) -> str: + """ + Overlay a transparent image on top of video + + Args: + video: Base video file path + overlay_image: Transparent overlay image path (e.g., rendered HTML with transparent background) + output: Output video file path + scale_mode: How to scale the base video to fit the overlay size + - "contain": Scale video to fit within overlay dimensions (letterbox/pillarbox) + - "cover": Scale video to cover overlay dimensions (may crop) + - "stretch": Stretch video to exact overlay dimensions + + Returns: + Path to the output video file + + Raises: + RuntimeError: If FFmpeg execution fails + + Note: + - Overlay image should have transparent background + - Video is scaled to match overlay dimensions based on scale_mode + - Final video size matches overlay image size + - Video codec is re-encoded to support overlay + """ + logger.info(f"Overlaying image on video (scale_mode={scale_mode})") + + try: + # Get overlay image dimensions + overlay_probe = ffmpeg.probe(overlay_image) + overlay_stream = next(s for s in overlay_probe['streams'] if s['codec_type'] == 'video') + overlay_width = int(overlay_stream['width']) + overlay_height = int(overlay_stream['height']) + + logger.debug(f"Overlay dimensions: {overlay_width}x{overlay_height}") + + input_video = ffmpeg.input(video) + input_overlay = ffmpeg.input(overlay_image) + + # Scale video to fit overlay size using scale_mode + if scale_mode == "contain": + # Scale to fit (letterbox/pillarbox if aspect ratio differs) + # Use scale filter with force_original_aspect_ratio=decrease and pad to center + scaled_video = ( + input_video + .filter('scale', overlay_width, 
overlay_height, force_original_aspect_ratio='decrease') + .filter('pad', overlay_width, overlay_height, '(ow-iw)/2', '(oh-ih)/2', color='black') + ) + elif scale_mode == "cover": + # Scale to cover (crop if aspect ratio differs) + scaled_video = ( + input_video + .filter('scale', overlay_width, overlay_height, force_original_aspect_ratio='increase') + .filter('crop', overlay_width, overlay_height) + ) + else: # stretch + # Stretch to exact dimensions + scaled_video = input_video.filter('scale', overlay_width, overlay_height) + + # Overlay the transparent image on top of the scaled video + output_stream = ffmpeg.overlay(scaled_video, input_overlay) + + ( + ffmpeg + .output(output_stream, output, + vcodec='libx264', + pix_fmt='yuv420p', + preset='medium', + crf=23) + .overwrite_output() + .run(capture_stdout=True, capture_stderr=True) + ) + + logger.success(f"Image overlaid on video: {output}") + return output + except ffmpeg.Error as e: + error_msg = e.stderr.decode() if e.stderr else str(e) + logger.error(f"FFmpeg overlay error: {error_msg}") + raise RuntimeError(f"Failed to overlay image on video: {error_msg}") + def create_video_from_image( self, image: str, diff --git a/pixelle_video/utils/content_generators.py b/pixelle_video/utils/content_generators.py index bbda711..21b4d59 100644 --- a/pixelle_video/utils/content_generators.py +++ b/pixelle_video/utils/content_generators.py @@ -321,6 +321,98 @@ async def generate_image_prompts( return all_prompts +async def generate_video_prompts( + llm_service, + narrations: List[str], + min_words: int = 30, + max_words: int = 60, + batch_size: int = 10, + max_retries: int = 3, + progress_callback: Optional[callable] = None +) -> List[str]: + """ + Generate video prompts from narrations (with batching and retry) + + Args: + llm_service: LLM service instance + narrations: List of narrations + min_words: Min video prompt length + max_words: Max video prompt length + batch_size: Max narrations per batch (default: 10) + max_retries: Max retry attempts per batch (default: 3) + progress_callback: Optional callback(completed, total, message) for progress updates + + Returns: + List of video prompts (base prompts, without prefix applied) + """ + from pixelle_video.prompts.video_generation import build_video_prompt_prompt + + logger.info(f"Generating video prompts for {len(narrations)} narrations (batch_size={batch_size})") + + # Split narrations into batches + batches = [narrations[i:i + batch_size] for i in range(0, len(narrations), batch_size)] + logger.info(f"Split into {len(batches)} batches") + + all_prompts = [] + + # Process each batch + for batch_idx, batch_narrations in enumerate(batches, 1): + logger.info(f"Processing batch {batch_idx}/{len(batches)} ({len(batch_narrations)} narrations)") + + # Retry logic for this batch + for attempt in range(1, max_retries + 1): + try: + # Generate prompts for this batch + prompt = build_video_prompt_prompt( + narrations=batch_narrations, + min_words=min_words, + max_words=max_words + ) + + response = await llm_service( + prompt=prompt, + temperature=0.7, + max_tokens=8192 + ) + + logger.debug(f"Batch {batch_idx} attempt {attempt}: LLM response length: {len(response)} chars") + + # Parse JSON + result = _parse_json(response) + + if "video_prompts" not in result: + raise KeyError("Invalid response format: missing 'video_prompts'") + + batch_prompts = result["video_prompts"] + + # Validate batch result + if len(batch_prompts) != len(batch_narrations): + raise ValueError( + f"Prompt count mismatch: expected 
{len(batch_narrations)}, got {len(batch_prompts)}" + ) + + # Success - add to all_prompts + all_prompts.extend(batch_prompts) + logger.info(f"✓ Batch {batch_idx} completed: {len(batch_prompts)} video prompts") + + # Report progress + if progress_callback: + completed = len(all_prompts) + total = len(narrations) + progress_callback(completed, total, f"Batch {batch_idx}/{len(batches)} completed") + + break # Success, move to next batch + + except Exception as e: + logger.warning(f"✗ Batch {batch_idx} attempt {attempt} failed: {e}") + if attempt >= max_retries: + raise + logger.info(f"Retrying batch {batch_idx}...") + + logger.info(f"✅ Generated {len(all_prompts)} video prompts") + return all_prompts + + def _parse_json(text: str) -> dict: """ Parse JSON from text, with fallback to extract JSON from markdown code blocks diff --git a/pixelle_video/utils/os_util.py b/pixelle_video/utils/os_util.py index 3538f7e..12c26dc 100644 --- a/pixelle_video/utils/os_util.py +++ b/pixelle_video/utils/os_util.py @@ -260,7 +260,7 @@ def get_task_path(task_id: str, *paths: str) -> str: def get_task_frame_path( task_id: str, frame_index: int, - file_type: Literal["audio", "image", "composed", "segment"] + file_type: Literal["audio", "image", "video", "composed", "segment"] ) -> str: """ Get frame file path within task directory @@ -268,7 +268,7 @@ def get_task_frame_path( Args: task_id: Task ID frame_index: Frame index (0-based internally, but filename starts from 01) - file_type: File type (audio/image/composed/segment) + file_type: File type (audio/image/video/composed/segment) Returns: Absolute path to frame file @@ -280,6 +280,7 @@ def get_task_frame_path( ext_map = { "audio": "mp3", "image": "png", + "video": "mp4", "composed": "png", "segment": "mp4" } diff --git a/web/app.py b/web/app.py index bb40383..8b26d2e 100644 --- a/web/app.py +++ b/web/app.py @@ -782,10 +782,29 @@ def main(): generator_for_params = HTMLFrameGenerator(template_path_for_params) custom_params_for_video = generator_for_params.parse_template_parameters() - # Detect if template requires image generation - template_requires_image = generator_for_params.requires_image() - # Store in session state for Image Section to read - st.session_state['template_requires_image'] = template_requires_image + # Detect template media type + from pathlib import Path + template_name = Path(frame_template).name + + if template_name.startswith("video_"): + # Video template + template_media_type = "video" + template_requires_media = True + elif generator_for_params.requires_image(): + # Image template + template_media_type = "image" + template_requires_media = True + else: + # Text-only template + template_media_type = "text" + template_requires_media = False + + # Store in session state for workflow filtering + st.session_state['template_media_type'] = template_media_type + st.session_state['template_requires_media'] = template_requires_media + + # Backward compatibility + st.session_state['template_requires_image'] = (template_media_type == "image") custom_values_for_video = {} if custom_params_for_video: @@ -928,25 +947,51 @@ def main(): logger.exception(e) # ==================================================================== - # Image Generation Section (conditional based on template) + # Media Generation Section (conditional based on template) # ==================================================================== - # Check if current template requires image generation - if st.session_state.get('template_requires_image', True): - # Template requires 
images - show full Image Section + # Check if current template requires media generation + template_media_type = st.session_state.get('template_media_type', 'image') + template_requires_media = st.session_state.get('template_requires_media', True) + + if template_requires_media: + # Template requires media - show Media Generation Section with st.container(border=True): - st.markdown(f"**{tr('section.image')}**") + # Dynamic section title based on template type + if template_media_type == "video": + section_title = tr('section.video') + else: + section_title = tr('section.image') + + st.markdown(f"**{section_title}**") # 1. ComfyUI Workflow selection with st.expander(tr("help.feature_description"), expanded=False): st.markdown(f"**{tr('help.what')}**") - st.markdown(tr("style.workflow_what")) + if template_media_type == "video": + st.markdown(tr('style.video_workflow_what')) + else: + st.markdown(tr("style.workflow_what")) st.markdown(f"**{tr('help.how')}**") - st.markdown(tr("style.workflow_how")) + if template_media_type == "video": + st.markdown(tr('style.video_workflow_how')) + else: + st.markdown(tr("style.workflow_how")) st.markdown(f"**{tr('help.note')}**") - st.markdown(tr("style.image_size_note")) + if template_media_type == "video": + st.markdown(tr('style.video_size_note')) + else: + st.markdown(tr("style.image_size_note")) - # Get available workflows from pixelle_video (with source info) - workflows = pixelle_video.image.list_workflows() + # Get available workflows and filter by template type + all_workflows = pixelle_video.media.list_workflows() + + # Filter workflows based on template media type + if template_media_type == "video": + # Only show video_ workflows + workflows = [wf for wf in all_workflows if "video_" in wf["key"].lower()] + else: + # Only show image_ workflows (exclude video_) + workflows = [wf for wf in all_workflows if "video_" not in wf["key"].lower()] # Build options for selectbox # Display: "image_flux.json - Runninghub" @@ -979,25 +1024,39 @@ def main(): workflow_key = "runninghub/image_flux.json" # fallback - # 2. Image size input + # 2. Media size input col1, col2 = st.columns(2) with col1: + if template_media_type == "video": + width_label = tr('style.video_width') + width_help = tr('style.video_width_help') + else: + width_label = tr('style.image_width') + width_help = tr('style.image_width_help') + image_width = st.number_input( - tr('style.image_width'), + width_label, min_value=128, value=1024, step=1, label_visibility="visible", - help=tr('style.image_width_help') + help=width_help ) with col2: + if template_media_type == "video": + height_label = tr('style.video_height') + height_help = tr('style.video_height_help') + else: + height_label = tr('style.image_height') + height_help = tr('style.image_height_help') + image_height = st.number_input( - tr('style.image_height'), + height_label, min_value=128, value=1024, step=1, label_visibility="visible", - help=tr('style.image_height_help') + help=height_help ) # 3. 
Prompt prefix input @@ -1014,54 +1073,71 @@ def main(): help=tr("style.prompt_prefix_help") ) - # Style preview expander (similar to template preview) - with st.expander(tr("style.preview_title"), expanded=False): + # Media preview expander + preview_title = tr("style.video_preview_title") if template_media_type == "video" else tr("style.preview_title") + with st.expander(preview_title, expanded=False): # Test prompt input + if template_media_type == "video": + test_prompt_label = tr("style.test_video_prompt") + test_prompt_value = "a dog running in the park" + else: + test_prompt_label = tr("style.test_prompt") + test_prompt_value = "a dog" + test_prompt = st.text_input( - tr("style.test_prompt"), - value="a dog", + test_prompt_label, + value=test_prompt_value, help=tr("style.test_prompt_help"), key="style_test_prompt" ) # Preview button - if st.button(tr("style.preview"), key="preview_style", use_container_width=True): - with st.spinner(tr("style.previewing")): + preview_button_label = tr("style.video_preview") if template_media_type == "video" else tr("style.preview") + if st.button(preview_button_label, key="preview_style", use_container_width=True): + previewing_text = tr("style.video_previewing") if template_media_type == "video" else tr("style.previewing") + with st.spinner(previewing_text): try: from pixelle_video.utils.prompt_helper import build_image_prompt # Build final prompt with prefix final_prompt = build_image_prompt(test_prompt, prompt_prefix) - # Generate preview image (use user-specified size) - preview_image_path = run_async(pixelle_video.image( + # Generate preview media (use user-specified size and media type) + media_result = run_async(pixelle_video.media( prompt=final_prompt, workflow=workflow_key, + media_type=template_media_type, width=int(image_width), height=int(image_height) )) + preview_media_path = media_result.url # Display preview (support both URL and local path) - if preview_image_path: - st.success(tr("style.preview_success")) + if preview_media_path: + success_text = tr("style.video_preview_success") if template_media_type == "video" else tr("style.preview_success") + st.success(success_text) - # Read and encode image - if preview_image_path.startswith('http'): - # URL - use directly - img_html = f'