diff --git a/README.md b/README.md index bdfba38..5a029d5 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ 只需输入一个 **主题**,Pixelle-Video 就能自动完成: - ✍️ 撰写视频文案 -- 🎨 生成 AI 配图 +- 🎨 生成 AI 配图/视频 - 🗣️ 合成语音解说 - 🎵 添加背景音乐 - 🎬 一键合成视频 @@ -32,6 +32,7 @@ - ✅ **全自动生成** - 输入主题,自动生成完整视频 - ✅ **AI 智能文案** - 根据主题智能创作解说词,无需自己写脚本 - ✅ **AI 生成配图** - 每句话都配上精美的 AI 插图 +- ✅ **AI 生成视频** - 支持使用 AI 视频生成模型(如 WAN 2.1)创建动态视频内容 - ✅ **AI 生成语音** - 支持 Edge-TTS、Index-TTS 等众多主流 TTS 方案 - ✅ **背景音乐** - 支持添加 BGM,让视频更有氛围 - ✅ **视觉风格** - 多种模板可选,打造独特视频风格 @@ -281,6 +282,12 @@ uv run streamlit run web/app.py #### 视频模板 决定视频画面的布局和设计。 +**模板命名规范** +- `static_*.html`: 静态模板(无需AI生成媒体,纯文字样式) +- `image_*.html`: 图片模板(使用AI生成的图片作为背景) +- `video_*.html`: 视频模板(使用AI生成的视频作为背景) + +**使用方法** - 从下拉菜单选择模板,按尺寸分组显示(竖屏/横屏/方形) - 点击「预览模板」可以自定义参数测试效果 - 如果懂 HTML,可以在 `templates/` 文件夹创建自己的模板 diff --git a/README_EN.md b/README_EN.md index 370bcc5..b8395b9 100644 --- a/README_EN.md +++ b/README_EN.md @@ -13,7 +13,7 @@ Just input a **topic**, and Pixelle-Video will automatically: - ✍️ Write video script -- 🎨 Generate AI images +- 🎨 Generate AI images/videos - 🗣️ Synthesize voice narration - 🎵 Add background music - 🎬 Create video with one click @@ -32,6 +32,7 @@ Just input a **topic**, and Pixelle-Video will automatically: - ✅ **Fully Automatic Generation** - Input a topic, automatically generate complete video - ✅ **AI Smart Copywriting** - Intelligently create narration based on topic, no need to write scripts yourself - ✅ **AI Generated Images** - Each sentence comes with beautiful AI illustrations +- ✅ **AI Generated Videos** - Support AI video generation models (like WAN 2.1) to create dynamic video content - ✅ **AI Generated Voice** - Support Edge-TTS, Index-TTS and many other mainstream TTS solutions - ✅ **Background Music** - Support adding BGM to make videos more atmospheric - ✅ **Visual Styles** - Multiple templates to choose from, create unique video styles @@ -281,6 +282,12 @@ Determine what style of images AI generates. 
#### Video Template Determines video layout and design. +**Template Naming Convention** +- `static_*.html`: Static templates (no AI-generated media, text-only styles) +- `image_*.html`: Image templates (uses AI-generated images as background) +- `video_*.html`: Video templates (uses AI-generated videos as background) + +**Usage** - Select template from dropdown menu, displayed grouped by dimension (portrait/landscape/square) - Click "Preview Template" to test effect with custom parameters - If you know HTML, you can create your own templates in the `templates/` folder diff --git a/api/routers/image.py b/api/routers/image.py index 92f23b1..f9bd460 100644 --- a/api/routers/image.py +++ b/api/routers/image.py @@ -43,18 +43,27 @@ async def image_generate( try: logger.info(f"Image generation request: {request.prompt[:50]}...") - # Call image service - image_path = await pixelle_video.image( + # Call media service (backward compatible with image API) + media_result = await pixelle_video.media( prompt=request.prompt, width=request.width, height=request.height, workflow=request.workflow ) + # For backward compatibility, only support image results in /image endpoint + if media_result.is_video: + raise HTTPException( + status_code=400, + detail="Video workflow used. Please use /media/generate endpoint for video generation." 
+ ) + return ImageGenerateResponse( - image_path=image_path + image_path=media_result.url ) + except HTTPException: + raise except Exception as e: logger.error(f"Image generation error: {e}") raise HTTPException(status_code=500, detail=str(e)) diff --git a/api/routers/video.py b/api/routers/video.py index e7a47cd..207e3c2 100644 --- a/api/routers/video.py +++ b/api/routers/video.py @@ -73,8 +73,7 @@ async def generate_video_sync( "max_narration_words": request_body.max_narration_words, "min_image_prompt_words": request_body.min_image_prompt_words, "max_image_prompt_words": request_body.max_image_prompt_words, - "image_width": request_body.image_width, - "image_height": request_body.image_height, + # Note: image_width and image_height are now auto-determined from template "image_workflow": request_body.image_workflow, "video_fps": request_body.video_fps, "frame_template": request_body.frame_template, @@ -161,8 +160,7 @@ async def generate_video_async( "max_narration_words": request_body.max_narration_words, "min_image_prompt_words": request_body.min_image_prompt_words, "max_image_prompt_words": request_body.max_image_prompt_words, - "image_width": request_body.image_width, - "image_height": request_body.image_height, + # Note: image_width and image_height are now auto-determined from template "image_workflow": request_body.image_workflow, "video_fps": request_body.video_fps, "frame_template": request_body.frame_template, diff --git a/api/schemas/video.py b/api/schemas/video.py index 93070f9..d37dd80 100644 --- a/api/schemas/video.py +++ b/api/schemas/video.py @@ -57,8 +57,7 @@ class VideoGenerateRequest(BaseModel): max_image_prompt_words: int = Field(60, ge=10, le=200, description="Max image prompt words") # === Image Parameters === - image_width: int = Field(1024, description="Image width") - image_height: int = Field(1024, description="Image height") + # Note: image_width and image_height are now auto-determined from template meta tags image_workflow: 
Optional[str] = Field(None, description="Custom image workflow") # === Video Parameters === diff --git a/config.example.yaml b/config.example.yaml index 0ecffec..02e68be 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -37,15 +37,29 @@ comfyui: # Image prompt prefix (optional) prompt_prefix: "Minimalist black-and-white matchstick figure style illustration, clean lines, simple sketch style" + + # Video-specific configuration + video: + # Required: Default workflow to use (no fallback) + # Options: runninghub/video_wan2.1_fusionx.json (recommended, no local setup) + # selfhost/video_wan2.1_fusionx.json (requires local ComfyUI) + default_workflow: runninghub/video_wan2.1_fusionx.json + + # Video prompt prefix (optional) + prompt_prefix: "Minimalist black-and-white matchstick figure style illustration, clean lines, simple sketch style" # ==================== Template Configuration ==================== # Configure default template for video generation template: # Default frame template to use when not explicitly specified # Determines video aspect ratio and layout style + # Template naming convention: + # - static_*.html: Static style templates (no AI-generated media) + # - image_*.html: Templates requiring AI-generated images + # - video_*.html: Templates requiring AI-generated videos # Options: - # - 1080x1920 (vertical/portrait): default.html, modern.html, elegant.html, etc. - # - 1080x1080 (square): minimal_framed.html, magazine_cover.html, etc. - # - 1920x1080 (horizontal/landscape): film.html, full.html, etc. + # - 1080x1920 (vertical/portrait): image_default.html, image_modern.html, image_elegant.html, static_simple.html, etc. + # - 1080x1080 (square): image_minimal_framed.html, etc. + # - 1920x1080 (horizontal/landscape): image_film.html, image_full.html, etc. 
# See templates/ directory for all available templates - default_template: "1080x1920/default.html" + default_template: "1080x1920/image_default.html" diff --git a/docs/en/reference/config-schema.md b/docs/en/reference/config-schema.md index 32ac8a3..60767dd 100644 --- a/docs/en/reference/config-schema.md +++ b/docs/en/reference/config-schema.md @@ -21,6 +21,10 @@ comfyui: default_workflow: "runninghub/image_flux.json" prompt_prefix: "Minimalist illustration style" + video: + default_workflow: "runninghub/video_wan2.1_fusionx.json" + prompt_prefix: "Minimalist illustration style" + tts: default_workflow: "selfhost/tts_edge.json" ``` @@ -48,6 +52,13 @@ comfyui: - `default_workflow`: Default image generation workflow - `prompt_prefix`: Prompt prefix +### Video Configuration + +- `default_workflow`: Default video generation workflow + - `runninghub/video_wan2.1_fusionx.json`: Cloud workflow (recommended, no local setup required) + - `selfhost/video_wan2.1_fusionx.json`: Local workflow (requires local ComfyUI support) +- `prompt_prefix`: Video prompt prefix (controls video generation style) + ### TTS Configuration - `default_workflow`: Default TTS workflow diff --git a/docs/en/user-guide/templates.md b/docs/en/user-guide/templates.md index 5d6a018..5d1e8d2 100644 --- a/docs/en/user-guide/templates.md +++ b/docs/en/user-guide/templates.md @@ -154,15 +154,39 @@ Suitable for Instagram, WeChat Moments, and other platforms. 
--- +## Template Naming Convention + +Templates follow a unified naming convention to distinguish different types: + +- **`static_*.html`**: Static templates + - No AI-generated media content required + - Pure text style rendering + - Suitable for quick generation and low-cost scenarios + +- **`image_*.html`**: Image templates + - Uses AI-generated images as background + - Invokes ComfyUI image generation workflows + - Suitable for content requiring visual illustrations + +- **`video_*.html`**: Video templates + - Uses AI-generated videos as background + - Invokes ComfyUI video generation workflows + - Creates dynamic video content with enhanced expressiveness + ## Template Structure Templates are located in the `templates/` directory, grouped by size: ``` templates/ -├── 1080x1920/ # Portrait (11 templates) -├── 1920x1080/ # Landscape (2 templates) -└── 1080x1080/ # Square (1 template) +├── 1080x1920/ # Portrait +│ ├── static_*.html # Static templates +│ ├── image_*.html # Image templates +│ └── video_*.html # Video templates +├── 1920x1080/ # Landscape +│ └── image_*.html # Image templates +└── 1080x1080/ # Square + └── image_*.html # Image templates ``` --- diff --git a/docs/en/user-guide/workflows.md b/docs/en/user-guide/workflows.md index 0538376..a4fec9d 100644 --- a/docs/en/user-guide/workflows.md +++ b/docs/en/user-guide/workflows.md @@ -16,10 +16,42 @@ Pixelle-Video is built on the ComfyUI architecture and supports custom workflows Located in `workflows/selfhost/` or `workflows/runninghub/` +Used for Text-to-Speech, supporting various TTS engines: +- Edge-TTS +- Index-TTS (supports voice cloning) +- Other ComfyUI-compatible TTS nodes + ### Image Generation Workflows Located in `workflows/selfhost/` or `workflows/runninghub/` +Used for generating static images as video backgrounds: +- FLUX series models +- Stable Diffusion series models +- Other image generation models + +### Video Generation Workflows + +Located in `workflows/selfhost/` or 
`workflows/runninghub/` + +**New Feature**: Supports AI video generation to create dynamic video content. + +**Preset Workflows**: +- `runninghub/video_wan2.1_fusionx.json`: Cloud workflow (recommended) + - Based on WAN 2.1 model + - No local setup required, accessed via RunningHub API + - Supports Text-to-Video generation + +- `selfhost/video_wan2.1_fusionx.json`: Local workflow + - Requires local ComfyUI environment + - Requires installation of corresponding video generation nodes + - Suitable for users with local GPU + +**Use Cases**: +- Works with `video_*.html` templates +- Automatically generates dynamic video backgrounds based on scripts +- Enhances visual expressiveness and viewing experience + --- ## Custom Workflows diff --git a/docs/zh/reference/config-schema.md b/docs/zh/reference/config-schema.md index b917771..fb6c8f5 100644 --- a/docs/zh/reference/config-schema.md +++ b/docs/zh/reference/config-schema.md @@ -21,6 +21,10 @@ comfyui: default_workflow: "runninghub/image_flux.json" prompt_prefix: "Minimalist illustration style" + video: + default_workflow: "runninghub/video_wan2.1_fusionx.json" + prompt_prefix: "Minimalist illustration style" + tts: default_workflow: "selfhost/tts_edge.json" ``` @@ -48,6 +52,13 @@ comfyui: - `default_workflow`: 默认图像生成工作流 - `prompt_prefix`: 提示词前缀 +### 视频配置 + +- `default_workflow`: 默认视频生成工作流 + - `runninghub/video_wan2.1_fusionx.json`: 云端工作流(推荐,无需本地环境) + - `selfhost/video_wan2.1_fusionx.json`: 本地工作流(需要本地 ComfyUI 支持) +- `prompt_prefix`: 视频提示词前缀(用于控制视频生成风格) + ### TTS 配置 - `default_workflow`: 默认 TTS 工作流 diff --git a/docs/zh/user-guide/templates.md b/docs/zh/user-guide/templates.md index bbf4887..7d9a976 100644 --- a/docs/zh/user-guide/templates.md +++ b/docs/zh/user-guide/templates.md @@ -154,15 +154,39 @@ --- +## 模板命名规范 + +模板采用统一的命名规范来区分不同类型: + +- **`static_*.html`**: 静态模板 + - 无需 AI 生成任何媒体内容 + - 纯文字样式渲染 + - 适合快速生成、低成本场景 + +- **`image_*.html`**: 图片模板 + - 使用 AI 生成的图片作为背景 + - 调用 ComfyUI 的图像生成工作流 + - 适合需要视觉配图的内容 + +- 
**`video_*.html`**: 视频模板 + - 使用 AI 生成的视频作为背景 + - 调用 ComfyUI 的视频生成工作流 + - 创建动态视频内容,增强表现力 + ## 模板结构 模板位于 `templates/` 目录,按尺寸分组: ``` templates/ -├── 1080x1920/ # 竖屏(11个模板) -├── 1920x1080/ # 横屏(2个模板) -└── 1080x1080/ # 方形(1个模板) +├── 1080x1920/ # 竖屏 +│ ├── static_*.html # 静态模板 +│ ├── image_*.html # 图片模板 +│ └── video_*.html # 视频模板 +├── 1920x1080/ # 横屏 +│ └── image_*.html # 图片模板 +└── 1080x1080/ # 方形 + └── image_*.html # 图片模板 ``` --- diff --git a/docs/zh/user-guide/workflows.md b/docs/zh/user-guide/workflows.md index a63e324..40412e1 100644 --- a/docs/zh/user-guide/workflows.md +++ b/docs/zh/user-guide/workflows.md @@ -16,10 +16,42 @@ Pixelle-Video 基于 ComfyUI 架构,支持自定义工作流。 位于 `workflows/selfhost/` 或 `workflows/runninghub/` +用于文本转语音(Text-to-Speech),支持多种 TTS 引擎: +- Edge-TTS +- Index-TTS(支持声音克隆) +- 其他 ComfyUI 兼容的 TTS 节点 + ### 图像生成工作流 位于 `workflows/selfhost/` 或 `workflows/runninghub/` +用于生成静态图像作为视频背景: +- FLUX 系列模型 +- Stable Diffusion 系列模型 +- 其他图像生成模型 + +### 视频生成工作流 + +位于 `workflows/selfhost/` 或 `workflows/runninghub/` + +**新功能**:支持 AI 视频生成,创建动态视频内容。 + +**预置工作流**: +- `runninghub/video_wan2.1_fusionx.json`: 云端工作流(推荐) + - 基于 WAN 2.1 模型 + - 无需本地环境,通过 RunningHub API 调用 + - 支持文本到视频(Text-to-Video) + +- `selfhost/video_wan2.1_fusionx.json`: 本地工作流 + - 需要本地 ComfyUI 环境 + - 需要安装相应的视频生成节点 + - 适合有本地 GPU 的用户 + +**使用场景**: +- 配合 `video_*.html` 模板使用 +- 自动根据文案生成动态视频背景 +- 增强视频的视觉表现力和观看体验 + --- ## 自定义工作流 diff --git a/pixelle_video/models/media.py b/pixelle_video/models/media.py new file mode 100644 index 0000000..2c7eca9 --- /dev/null +++ b/pixelle_video/models/media.py @@ -0,0 +1,61 @@ +# Copyright (C) 2025 AIDC-AI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Media generation result models +""" + +from typing import Literal, Optional +from pydantic import BaseModel, Field + + +class MediaResult(BaseModel): + """ + Media generation result from workflow execution + + Supports both image and video outputs from ComfyUI workflows. + The media_type indicates what kind of media was generated. + + Attributes: + media_type: Type of media generated ("image" or "video") + url: URL or path to the generated media + duration: Duration in seconds (only for video, None for image) + + Examples: + # Image result + MediaResult(media_type="image", url="http://example.com/image.png") + + # Video result + MediaResult(media_type="video", url="http://example.com/video.mp4", duration=5.2) + """ + + media_type: Literal["image", "video"] = Field( + description="Type of generated media" + ) + url: str = Field( + description="URL or path to the generated media file" + ) + duration: Optional[float] = Field( + None, + description="Duration in seconds (only applicable for video)" + ) + + @property + def is_image(self) -> bool: + """Check if this is an image result""" + return self.media_type == "image" + + @property + def is_video(self) -> bool: + """Check if this is a video result""" + return self.media_type == "video" + diff --git a/pixelle_video/models/storyboard.py b/pixelle_video/models/storyboard.py index 6ef3f7d..1204991 100644 --- a/pixelle_video/models/storyboard.py +++ b/pixelle_video/models/storyboard.py @@ -57,16 +57,18 @@ class StoryboardFrame: """Single storyboard frame""" index: int # Frame index (0-based) narration: str # 
Narration text - image_prompt: str # Image generation prompt + image_prompt: str # Image generation prompt (can be None for text-only or video) # Generated resource paths - audio_path: Optional[str] = None # Audio file path - image_path: Optional[str] = None # Original image path - composed_image_path: Optional[str] = None # Composed image path (with subtitles) - video_segment_path: Optional[str] = None # Video segment path + audio_path: Optional[str] = None # Audio file path (narration) + media_type: Optional[str] = None # Media type: "image" or "video" (None if no media) + image_path: Optional[str] = None # Original image path (for image type) + video_path: Optional[str] = None # Original video path (for video type, before composition) + composed_image_path: Optional[str] = None # Composed image path (with subtitles, for image type) + video_segment_path: Optional[str] = None # Final video segment path # Metadata - duration: float = 0.0 # Audio duration (seconds) + duration: float = 0.0 # Frame duration (seconds, from audio or video) created_at: Optional[datetime] = None def __post_init__(self): diff --git a/pixelle_video/pipelines/base.py b/pixelle_video/pipelines/base.py index 59493cd..b511e48 100644 --- a/pixelle_video/pipelines/base.py +++ b/pixelle_video/pipelines/base.py @@ -63,8 +63,11 @@ class BasePipeline(ABC): # Quick access to services (convenience) self.llm = pixelle_video_core.llm self.tts = pixelle_video_core.tts - self.image = pixelle_video_core.image + self.media = pixelle_video_core.media self.video = pixelle_video_core.video + + # Backward compatibility alias + self.image = pixelle_video_core.media @abstractmethod async def __call__( diff --git a/pixelle_video/pipelines/custom.py b/pixelle_video/pipelines/custom.py index e1779c4..a453de4 100644 --- a/pixelle_video/pipelines/custom.py +++ b/pixelle_video/pipelines/custom.py @@ -92,8 +92,7 @@ class CustomPipeline(BasePipeline): ref_audio: Optional[str] = None, image_workflow: Optional[str] = None, 
- image_width: int = 1024, - image_height: int = 1024, + # Note: image_width and image_height are now auto-determined from template frame_template: Optional[str] = None, video_fps: int = 30, @@ -118,9 +117,10 @@ class CustomPipeline(BasePipeline): VideoGenerationResult Image Generation Logic: - - If template has {{image}} → automatically generates images - - If template has no {{image}} → skips image generation (faster, cheaper) - - To customize: Override the template_requires_image logic in your subclass + - image_*.html templates → automatically generates images + - video_*.html templates → automatically generates videos + - static_*.html templates → skips media generation (faster, cheaper) + - To customize: Override the template type detection logic in your subclass """ logger.info("Starting CustomPipeline") logger.info(f"Input text length: {len(text)} chars") @@ -152,19 +152,27 @@ class CustomPipeline(BasePipeline): frame_template = template_config.get("default_template", "1080x1920/default.html") # ========== Step 0.5: Check template requirements ========== - # Detect if template requires {{image}} parameter - # This allows skipping the entire image generation pipeline for text-only templates + # Detect template type by filename prefix + from pathlib import Path from pixelle_video.services.frame_html import HTMLFrameGenerator - from pixelle_video.utils.template_util import resolve_template_path + from pixelle_video.utils.template_util import resolve_template_path, get_template_type + template_name = Path(frame_template).name + template_type = get_template_type(template_name) + template_requires_image = (template_type == "image") + + # Read media size from template meta tags template_path = resolve_template_path(frame_template) generator = HTMLFrameGenerator(template_path) - template_requires_image = generator.requires_image() + image_width, image_height = generator.get_media_size() + logger.info(f"📐 Media size from template: {image_width}x{image_height}") - if 
template_requires_image: + if template_type == "image": logger.info(f"📸 Template requires image generation") - else: - logger.info(f"⚡ Template does not require images - skipping image generation pipeline") + elif template_type == "video": + logger.info(f"🎬 Template requires video generation") + else: # static + logger.info(f"⚡ Static template - skipping media generation pipeline") logger.info(f" 💡 Benefits: Faster generation + Lower cost + No ComfyUI dependency") # ========== Step 1: Process content (CUSTOMIZE THIS) ========== @@ -194,8 +202,8 @@ class CustomPipeline(BasePipeline): # ========== Step 2: Generate image prompts (CONDITIONAL - CUSTOMIZE THIS) ========== self._report_progress(progress_callback, "generating_image_prompts", 0.25) - # IMPORTANT: Check if template actually needs images - # If your template doesn't use {{image}}, you can skip this entire step! + # IMPORTANT: Check if template is image type + # If your template is static_*.html, you can skip this entire step! if template_requires_image: # Template requires images - generate image prompts using LLM from pixelle_video.utils.content_generators import generate_image_prompts diff --git a/pixelle_video/pipelines/standard.py b/pixelle_video/pipelines/standard.py index 44af54c..fceedfb 100644 --- a/pixelle_video/pipelines/standard.py +++ b/pixelle_video/pipelines/standard.py @@ -94,8 +94,7 @@ class StandardPipeline(BasePipeline): max_image_prompt_words: int = 60, # === Image Parameters === - image_width: int = 1024, - image_height: int = 1024, + # Note: image_width and image_height are now auto-determined from template meta tags image_workflow: Optional[str] = None, # === Video Parameters === @@ -151,9 +150,8 @@ class StandardPipeline(BasePipeline): min_image_prompt_words: Min image prompt length max_image_prompt_words: Max image prompt length - image_width: Generated image width (default 1024) - image_height: Generated image height (default 1024) image_workflow: Image workflow filename (e.g., 
"image_flux.json", None = use default) + Note: Image/video size is now auto-determined from template meta tags video_fps: Video frame rate (default 30) @@ -239,6 +237,16 @@ class StandardPipeline(BasePipeline): template_config = self.core.config.get("template", {}) frame_template = template_config.get("default_template", "1080x1920/default.html") + # Read media size from template meta tags + from pixelle_video.services.frame_html import HTMLFrameGenerator + from pixelle_video.utils.template_util import resolve_template_path + + template_path = resolve_template_path(frame_template) + temp_generator = HTMLFrameGenerator(template_path) + image_width, image_height = temp_generator.get_media_size() + + logger.info(f"📐 Media size from template: {image_width}x{image_height}") + # Create storyboard config config = StoryboardConfig( task_id=task_id, @@ -269,11 +277,13 @@ class StandardPipeline(BasePipeline): ) # ========== Step 0.8: Check template requirements ========== - template_requires_image = self._check_template_requires_image(config.frame_template) - if template_requires_image: + template_media_type = self._check_template_media_type(config.frame_template) + if template_media_type == "video": + logger.info(f"🎬 Template requires video generation") + elif template_media_type == "image": logger.info(f"📸 Template requires image generation") - else: - logger.info(f"⚡ Template does not require images - skipping image generation pipeline") + else: # static + logger.info(f"⚡ Static template - skipping media generation pipeline") logger.info(f" 💡 Benefits: Faster generation + Lower cost + No ComfyUI dependency") try: @@ -294,8 +304,61 @@ class StandardPipeline(BasePipeline): logger.info(f"✅ Split script into {len(narrations)} segments (by lines)") logger.info(f" Note: n_scenes={n_scenes} is ignored in fixed mode") - # ========== Step 2: Generate image prompts (conditional) ========== - if template_requires_image: + # ========== Step 2: Generate media prompts (conditional) 
========== + if template_media_type == "video": + # Video template: generate video prompts + self._report_progress(progress_callback, "generating_video_prompts", 0.15) + + from pixelle_video.utils.content_generators import generate_video_prompts + + # Override prompt_prefix if provided + original_prefix = None + if prompt_prefix is not None: + image_config = self.core.config.get("comfyui", {}).get("image", {}) + original_prefix = image_config.get("prompt_prefix") + image_config["prompt_prefix"] = prompt_prefix + logger.info(f"Using custom prompt_prefix: '{prompt_prefix}'") + + try: + # Create progress callback wrapper for video prompt generation + def video_prompt_progress(completed: int, total: int, message: str): + batch_progress = completed / total if total > 0 else 0 + overall_progress = 0.15 + (batch_progress * 0.15) + self._report_progress( + progress_callback, + "generating_video_prompts", + overall_progress, + extra_info=message + ) + + # Generate base video prompts + base_image_prompts = await generate_video_prompts( + self.llm, + narrations=narrations, + min_words=min_image_prompt_words, + max_words=max_image_prompt_words, + progress_callback=video_prompt_progress + ) + + # Apply prompt prefix + from pixelle_video.utils.prompt_helper import build_image_prompt + image_config = self.core.config.get("comfyui", {}).get("image", {}) + prompt_prefix_to_use = prompt_prefix if prompt_prefix is not None else image_config.get("prompt_prefix", "") + + image_prompts = [] + for base_prompt in base_image_prompts: + final_prompt = build_image_prompt(base_prompt, prompt_prefix_to_use) + image_prompts.append(final_prompt) + + finally: + # Restore original prompt_prefix + if original_prefix is not None: + image_config["prompt_prefix"] = original_prefix + + logger.info(f"✅ Generated {len(image_prompts)} video prompts") + + elif template_media_type == "image": + # Image template: generate image prompts self._report_progress(progress_callback, "generating_image_prompts", 
0.15) + # Override prompt_prefix if provided @@ -343,12 +406,13 @@ image_config["prompt_prefix"] = original_prefix logger.info(f"✅ Generated {len(image_prompts)} image prompts") - else: - # Skip image prompt generation + + else: # static + # Static template: skip media prompt generation image_prompts = [None] * len(narrations) self._report_progress(progress_callback, "preparing_frames", 0.15) - logger.info(f"⚡ Skipped image prompt generation (template doesn't need images)") - logger.info(f" 💡 Savings: {len(narrations)} LLM calls + {len(narrations)} image generations") + logger.info(f"⚡ Skipped media prompt generation (static template)") + logger.info(f" 💡 Savings: {len(narrations)} LLM calls + {len(narrations)} media generations") # ========== Step 3: Create frames ========== for i, (narration, image_prompt) in enumerate(zip(narrations, image_prompts)): @@ -452,29 +516,32 @@ logger.error(f"❌ Video generation failed: {e}") raise - def _check_template_requires_image(self, frame_template: str) -> bool: + def _check_template_media_type(self, frame_template: str) -> str: """ - Check if template requires image generation + Check template media type requirement This is checked at pipeline level to avoid unnecessary: - - LLM calls (generating image_prompts) - - Image generation API calls + - LLM calls (generating media prompts) + - Media generation API calls - ComfyUI dependency + Template naming convention: + - static_*.html: Static style template (returns "static") + - image_*.html: Image template (returns "image") + - video_*.html: Video template (returns "video") + Args: - frame_template: Template path (e.g., "1080x1920/default.html") + frame_template: Template path (e.g., "1080x1920/image_default.html" or "1080x1920/video_default.html") Returns: - True if template contains {{image}}, False otherwise + "static", "image", or "video" """ - from pixelle_video.services.frame_html import 
HTMLFrameGenerator - from pixelle_video.utils.template_util import resolve_template_path + from pixelle_video.utils.template_util import get_template_type - template_path = resolve_template_path(frame_template) - generator = HTMLFrameGenerator(template_path) + # Determine type by template filename prefix + template_name = Path(frame_template).name + template_type = get_template_type(template_name) - requires = generator.requires_image() - logger.debug(f"Template '{frame_template}' requires_image={requires}") - - return requires + logger.debug(f"Template '{frame_template}' is {template_type} template") + return template_type diff --git a/pixelle_video/prompts/video_generation.py b/pixelle_video/prompts/video_generation.py new file mode 100644 index 0000000..f795012 --- /dev/null +++ b/pixelle_video/prompts/video_generation.py @@ -0,0 +1,133 @@ +# Copyright (C) 2025 AIDC-AI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Video prompt generation template + +For generating video prompts from narrations. 
+""" + +import json +from typing import List + + +VIDEO_PROMPT_GENERATION_PROMPT = """# 角色定位 +你是一个专业的视频创意设计师,擅长为视频脚本创作富有动感和表现力的视频生成提示词,将叙述内容转化为生动的视频画面。 + +# 核心任务 +基于已有的视频脚本,为每个分镜的"旁白内容"创作对应的**英文**视频生成提示词,确保视频画面与叙述内容完美配合,通过动态画面增强观众的理解和记忆。 + +**重要:输入包含 {narrations_count} 个旁白,你必须为每个旁白都生成一个对应的视频提示词,总共输出 {narrations_count} 个视频提示词。** + +# 输入内容 +{narrations_json} + +# 输出要求 + +## 视频提示词规范 +- 语言:**必须使用英文**(用于 AI 视频生成模型) +- 描述结构:scene + character action + camera movement + emotion + atmosphere +- 描述长度:确保描述清晰完整且富有创意(建议 50-100 个英文单词) +- 动态元素:强调动作、运动、变化等动态效果 + +## 视觉创意要求 +- 每个视频都要准确反映对应旁白的具体内容和情感 +- 突出画面的动态性:角色动作、物体运动、镜头移动、场景转换等 +- 使用象征手法将抽象概念视觉化(如用流动的水代表时间流逝,用上升的阶梯代表进步等) +- 画面要表现出丰富的情感和动作,增强视觉冲击力 +- 通过镜头语言(推拉摇移)和剪辑节奏增强表现力 + +## 关键英文词汇参考 +- 动作:moving, running, flowing, transforming, growing, falling +- 镜头:camera pan, zoom in, zoom out, tracking shot, aerial view +- 转场:transition, fade in, fade out, dissolve +- 氛围:dynamic, energetic, peaceful, dramatic, mysterious +- 光影:lighting changes, shadows moving, sunlight streaming + +## 视频与文案配合原则 +- 视频要服务于文案,成为文案内容的视觉延伸 +- 避免与文案内容无关或矛盾的视觉元素 +- 选择最能增强文案说服力的动态表现方式 +- 确保观众能通过视频动态快速理解文案的核心观点 + +## 创意指导 +1. **现象描述类文案**:用动态场景表现社会现象的发生过程 +2. **原因分析类文案**:用因果关系的动态演变表现内在逻辑 +3. **影响论证类文案**:用后果场景的动态展开或对比表现影响程度 +4. **深入探讨类文案**:用抽象概念的动态具象化表现深刻思考 +5. **结论启发类文案**:用开放式动态场景或指引性运动表现启发性 + +## 视频特有注意事项 +- 强调动态:每个视频都应该包含明显的动作或运动 +- 镜头语言:适当使用推拉摇移等镜头技巧增强表现力 +- 时长考虑:视频应该是连贯的动态过程,不是静态画面 +- 流畅性:注意动作的流畅性和自然性 + +# 输出格式 +严格按照以下JSON格式输出,**视频提示词必须是英文**: + +```json +{{ + "video_prompts": [ + "[detailed English video prompt with dynamic elements and camera movements]", + "[detailed English video prompt with dynamic elements and camera movements]" + ] +}} +``` + +# 重要提醒 +1. 只输出JSON格式内容,不要添加任何解释说明 +2. 确保JSON格式严格正确,可以被程序直接解析 +3. 输入是 {{"narrations": [旁白数组]}} 格式,输出是 {{"video_prompts": [视频提示词数组]}} 格式 +4. **输出的video_prompts数组必须恰好包含 {narrations_count} 个元素,与输入的narrations数组一一对应** +5. **视频提示词必须使用英文**(for AI video generation models) +6. 视频提示词必须准确反映对应旁白的具体内容和情感 +7. 
每个视频都要强调动态性和运动感,避免静态描述 +8. 适当使用镜头语言增强表现力 +9. 确保视频画面能增强文案的说服力和观众的理解度 + +现在,请为上述 {narrations_count} 个旁白创作对应的 {narrations_count} 个**英文**视频提示词。只输出JSON,不要其他内容。 +""" + + +def build_video_prompt_prompt( + narrations: List[str], + min_words: int, + max_words: int +) -> str: + """ + Build video prompt generation prompt + + Args: + narrations: List of narrations + min_words: Minimum word count + max_words: Maximum word count + + Returns: + Formatted prompt for LLM + + Example: + >>> build_video_prompt_prompt(narrations, 50, 100) + """ + narrations_json = json.dumps( + {"narrations": narrations}, + ensure_ascii=False, + indent=2 + ) + + return VIDEO_PROMPT_GENERATION_PROMPT.format( + narrations_json=narrations_json, + narrations_count=len(narrations), + min_words=min_words, + max_words=max_words + ) + diff --git a/pixelle_video/service.py b/pixelle_video/service.py index 5cdae0e..d2ca202 100644 --- a/pixelle_video/service.py +++ b/pixelle_video/service.py @@ -23,7 +23,7 @@ from loguru import logger from pixelle_video.config import config_manager from pixelle_video.services.llm_service import LLMService from pixelle_video.services.tts_service import TTSService -from pixelle_video.services.image import ImageService +from pixelle_video.services.media import MediaService from pixelle_video.services.video import VideoService from pixelle_video.services.frame_processor import FrameProcessor from pixelle_video.pipelines.standard import StandardPipeline @@ -45,7 +45,7 @@ class PixelleVideoCore: # Use capabilities directly answer = await pixelle_video.llm("Explain atomic habits") audio = await pixelle_video.tts("Hello world") - image = await pixelle_video.image(prompt="a cat") + media = await pixelle_video.media(prompt="a cat") # Check active capabilities print(f"Using LLM: {pixelle_video.llm.active}") @@ -56,7 +56,7 @@ class PixelleVideoCore: ├── config (configuration) ├── llm (LLM service - direct OpenAI SDK) ├── tts (TTS service - ComfyKit workflows) - ├── image (Image service - 
ComfyKit workflows) + ├── media (Media service - ComfyKit workflows, supports image & video) └── pipelines (video generation pipelines) ├── standard (standard workflow) ├── custom (custom workflow template) @@ -77,7 +77,7 @@ class PixelleVideoCore: # Core services (initialized in initialize()) self.llm: Optional[LLMService] = None self.tts: Optional[TTSService] = None - self.image: Optional[ImageService] = None + self.media: Optional[MediaService] = None self.video: Optional[VideoService] = None self.frame_processor: Optional[FrameProcessor] = None @@ -105,7 +105,7 @@ class PixelleVideoCore: # 1. Initialize core services self.llm = LLMService(self.config) self.tts = TTSService(self.config) - self.image = ImageService(self.config) + self.media = MediaService(self.config) self.video = VideoService() self.frame_processor = FrameProcessor(self) diff --git a/pixelle_video/services/__init__.py b/pixelle_video/services/__init__.py index fd4d282..77979c5 100644 --- a/pixelle_video/services/__init__.py +++ b/pixelle_video/services/__init__.py @@ -18,7 +18,7 @@ Core services providing atomic capabilities. 
Services: - LLMService: LLM text generation - TTSService: Text-to-speech -- ImageService: Image generation +- MediaService: Media generation (image & video) - VideoService: Video processing - FrameProcessor: Frame processing orchestrator - ComfyBaseService: Base class for ComfyUI-based services @@ -27,15 +27,19 @@ Services: from pixelle_video.services.comfy_base_service import ComfyBaseService from pixelle_video.services.llm_service import LLMService from pixelle_video.services.tts_service import TTSService -from pixelle_video.services.image import ImageService +from pixelle_video.services.media import MediaService from pixelle_video.services.video import VideoService from pixelle_video.services.frame_processor import FrameProcessor +# Backward compatibility alias +ImageService = MediaService + __all__ = [ "ComfyBaseService", "LLMService", "TTSService", - "ImageService", + "MediaService", + "ImageService", # Backward compatibility "VideoService", "FrameProcessor", ] diff --git a/pixelle_video/services/frame_html.py b/pixelle_video/services/frame_html.py index 4efd02d..01ffe70 100644 --- a/pixelle_video/services/frame_html.py +++ b/pixelle_video/services/frame_html.py @@ -77,21 +77,6 @@ class HTMLFrameGenerator: self._check_linux_dependencies() logger.debug(f"Loaded HTML template: {template_path} (size: {self.width}x{self.height})") - def requires_image(self) -> bool: - """ - Detect if template requires {{image}} parameter - - This method checks if the template uses the {{image}} variable. 
- If the template doesn't use images, the entire image generation - pipeline can be skipped, significantly improving: - - Generation speed (no image generation API calls) - - Cost efficiency (no LLM calls for image prompts) - - Dependency requirements (no ComfyUI needed) - - Returns: - True if template contains {{image}}, False otherwise - """ - return '{{image}}' in self.template def _check_linux_dependencies(self): """Check Linux system dependencies and warn if missing""" @@ -141,6 +126,58 @@ class HTMLFrameGenerator: logger.debug(f"Template loaded: {len(content)} chars") return content + def _parse_media_size_from_meta(self) -> tuple[Optional[int], Optional[int]]: + """ + Parse media size from meta tags in template + + Looks for meta tags: + - <meta name="template:media-width" content="1024"> + - <meta name="template:media-height" content="1024"> + + Returns: + Tuple of (width, height) or (None, None) if not found + """ + from bs4 import BeautifulSoup + + try: + soup = BeautifulSoup(self.template, 'html.parser') + + # Find width and height meta tags + width_meta = soup.find('meta', attrs={'name': 'template:media-width'}) + height_meta = soup.find('meta', attrs={'name': 'template:media-height'}) + + if width_meta and height_meta: + width = int(width_meta.get('content', 0)) + height = int(height_meta.get('content', 0)) + + if width > 0 and height > 0: + logger.debug(f"Found media size in meta tags: {width}x{height}") + return width, height + + return None, None + + except Exception as e: + logger.warning(f"Failed to parse media size from meta tags: {e}") + return None, None + + def get_media_size(self) -> tuple[int, int]: + """ + Get media size for image/video generation + + Returns media size specified in template meta tags.
+ + Returns: + Tuple of (width, height) + """ + media_width, media_height = self._parse_media_size_from_meta() + + if media_width and media_height: + return media_width, media_height + + # Fallback to default if not specified (should not happen with properly configured templates) + logger.warning(f"No media size meta tags found in template {self.template_path}, using fallback 1024x1024") + return 1024, 1024 + def parse_template_parameters(self) -> Dict[str, Dict[str, Any]]: """ Parse custom parameters from HTML template diff --git a/pixelle_video/services/frame_processor.py b/pixelle_video/services/frame_processor.py index 095eaf1..4767663 100644 --- a/pixelle_video/services/frame_processor.py +++ b/pixelle_video/services/frame_processor.py @@ -84,7 +84,7 @@ class FrameProcessor: )) await self._step_generate_audio(frame, config) - # Step 2: Generate image (conditional) + # Step 2: Generate media (image or video, conditional) if needs_image: if progress_callback: progress_callback(ProgressEvent( @@ -93,12 +93,13 @@ class FrameProcessor: frame_current=frame_num, frame_total=total_frames, step=2, - action="image" + action="media" )) - await self._step_generate_image(frame, config) + await self._step_generate_media(frame, config) else: frame.image_path = None - logger.debug(f" 2/4: Skipped image generation (not required by template)") + frame.media_type = None + logger.debug(f" 2/4: Skipped media generation (not required by template)") # Step 3: Compose frame (add subtitle) if progress_callback: @@ -176,27 +177,66 @@ class FrameProcessor: logger.debug(f" ✓ Audio generated: {audio_path} ({frame.duration:.2f}s)") - async def _step_generate_image( + async def _step_generate_media( self, frame: StoryboardFrame, config: StoryboardConfig ): - """Step 2: Generate image using ComfyKit""" - logger.debug(f" 2/4: Generating image for frame {frame.index}...") + """Step 2: Generate media (image or video) using ComfyKit""" + logger.debug(f" 2/4: Generating media for frame 
{frame.index}...") - # Call Image generation (with optional preset) - image_url = await self.core.image( + # Determine media type based on workflow + # video_ prefix in workflow name indicates video generation + workflow_name = config.image_workflow or "" + is_video_workflow = "video_" in workflow_name.lower() + media_type = "video" if is_video_workflow else "image" + + logger.debug(f" → Media type: {media_type} (workflow: {workflow_name})") + + # Call Media generation (with optional preset) + media_result = await self.core.media( prompt=frame.image_prompt, workflow=config.image_workflow, # Pass workflow from config (None = use default) + media_type=media_type, width=config.image_width, height=config.image_height ) - # Download image to local (pass task_id) - local_path = await self._download_image(image_url, frame.index, config.task_id) - frame.image_path = local_path + # Store media type + frame.media_type = media_result.media_type - logger.debug(f" ✓ Image generated: {local_path}") + if media_result.is_image: + # Download image to local (pass task_id) + local_path = await self._download_media( + media_result.url, + frame.index, + config.task_id, + media_type="image" + ) + frame.image_path = local_path + logger.debug(f" ✓ Image generated: {local_path}") + + elif media_result.is_video: + # Download video to local (pass task_id) + local_path = await self._download_media( + media_result.url, + frame.index, + config.task_id, + media_type="video" + ) + frame.video_path = local_path + + # Update duration from video if available + if media_result.duration: + frame.duration = media_result.duration + logger.debug(f" ✓ Video generated: {local_path} (duration: {frame.duration:.2f}s)") + else: + # Get video duration from file + frame.duration = await self._get_video_duration(local_path) + logger.debug(f" ✓ Video generated: {local_path} (duration: {frame.duration:.2f}s)") + + else: + raise ValueError(f"Unknown media type: {media_result.media_type}") async def 
_step_compose_frame( self, @@ -211,7 +251,9 @@ class FrameProcessor: from pixelle_video.utils.os_util import get_task_frame_path output_path = get_task_frame_path(config.task_id, frame.index, "composed") - # Use HTML template to compose frame + # For video type: render HTML as transparent overlay image + # For image type: render HTML with image background + # In both cases, we need the composed image composed_path = await self._compose_frame_html(frame, storyboard, config, output_path) frame.composed_image_path = composed_path @@ -264,23 +306,60 @@ class FrameProcessor: frame: StoryboardFrame, config: StoryboardConfig ): - """Step 4: Create video segment from image + audio""" + """Step 4: Create video segment from media + audio""" logger.debug(f" 4/4: Creating video segment for frame {frame.index}...") # Generate output path using task_id from pixelle_video.utils.os_util import get_task_frame_path output_path = get_task_frame_path(config.task_id, frame.index, "segment") - # Call video compositor to create video from image + audio from pixelle_video.services.video import VideoService video_service = VideoService() - segment_path = video_service.create_video_from_image( - image=frame.composed_image_path, - audio=frame.audio_path, - output=output_path, - fps=config.video_fps - ) + # Branch based on media type + if frame.media_type == "video": + # Video workflow: overlay HTML template on video, then add audio + logger.debug(f" → Using video-based composition with HTML overlay") + + # Step 1: Overlay transparent HTML image on video + # The composed_image_path contains the rendered HTML with transparent background + temp_video_with_overlay = get_task_frame_path(config.task_id, frame.index, "video") + "_overlay.mp4" + + video_service.overlay_image_on_video( + video=frame.video_path, + overlay_image=frame.composed_image_path, + output=temp_video_with_overlay, + scale_mode="contain" # Scale video to fit template size (contain mode) + ) + + # Step 2: Add narration audio to 
the overlaid video + # Note: The video might have audio (replaced) or be silent (audio added) + segment_path = video_service.merge_audio_video( + video=temp_video_with_overlay, + audio=frame.audio_path, + output=output_path, + replace_audio=True, # Replace video audio with narration + audio_volume=1.0 + ) + + # Clean up temp file + import os + if os.path.exists(temp_video_with_overlay): + os.unlink(temp_video_with_overlay) + + elif frame.media_type == "image" or frame.media_type is None: + # Image workflow: create video from image + audio + logger.debug(f" → Using image-based composition") + + segment_path = video_service.create_video_from_image( + image=frame.composed_image_path, + audio=frame.audio_path, + output=output_path, + fps=config.video_fps + ) + + else: + raise ValueError(f"Unknown media type: {frame.media_type}") frame.video_segment_path = segment_path @@ -303,10 +382,16 @@ class FrameProcessor: estimated_duration = file_size / 2000 return max(1.0, estimated_duration) # At least 1 second - async def _download_image(self, url: str, frame_index: int, task_id: str) -> str: - """Download image from URL to local file""" + async def _download_media( + self, + url: str, + frame_index: int, + task_id: str, + media_type: str + ) -> str: + """Download media (image or video) from URL to local file""" from pixelle_video.utils.os_util import get_task_frame_path - output_path = get_task_frame_path(task_id, frame_index, "image") + output_path = get_task_frame_path(task_id, frame_index, media_type) async with httpx.AsyncClient() as client: response = await client.get(url) @@ -316,4 +401,16 @@ class FrameProcessor: f.write(response.content) return output_path + + async def _get_video_duration(self, video_path: str) -> float: + """Get video duration in seconds""" + try: + import ffmpeg + probe = ffmpeg.probe(video_path) + duration = float(probe['format']['duration']) + return duration + except Exception as e: + logger.warning(f"Failed to get video duration: {e}, using 
audio duration") + # Fallback: use audio duration if available + return 1.0 # Default to 1 second if unable to determine diff --git a/pixelle_video/services/image.py b/pixelle_video/services/image.py deleted file mode 100644 index 83c2c72..0000000 --- a/pixelle_video/services/image.py +++ /dev/null @@ -1,192 +0,0 @@ -# Copyright (C) 2025 AIDC-AI -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Image Generation Service - ComfyUI Workflow-based implementation -""" - -from typing import Optional - -from comfykit import ComfyKit -from loguru import logger - -from pixelle_video.services.comfy_base_service import ComfyBaseService - - -class ImageService(ComfyBaseService): - """ - Image generation service - Workflow-based - - Uses ComfyKit to execute image generation workflows. 
- - Usage: - # Use default workflow (workflows/image_flux.json) - image_url = await pixelle_video.image(prompt="a cat") - - # Use specific workflow - image_url = await pixelle_video.image( - prompt="a cat", - workflow="image_flux.json" - ) - - # List available workflows - workflows = pixelle_video.image.list_workflows() - """ - - WORKFLOW_PREFIX = "image_" - DEFAULT_WORKFLOW = None # No hardcoded default, must be configured - WORKFLOWS_DIR = "workflows" - - def __init__(self, config: dict): - """ - Initialize image service - - Args: - config: Full application config dict - """ - super().__init__(config, service_name="image") - - async def __call__( - self, - prompt: str, - workflow: Optional[str] = None, - # ComfyUI connection (optional overrides) - comfyui_url: Optional[str] = None, - runninghub_api_key: Optional[str] = None, - # Common workflow parameters - width: Optional[int] = None, - height: Optional[int] = None, - negative_prompt: Optional[str] = None, - steps: Optional[int] = None, - seed: Optional[int] = None, - cfg: Optional[float] = None, - sampler: Optional[str] = None, - **params - ) -> str: - """ - Generate image using workflow - - Args: - prompt: Image generation prompt - workflow: Workflow filename (default: from config or "image_flux.json") - comfyui_url: ComfyUI URL (optional, overrides config) - runninghub_api_key: RunningHub API key (optional, overrides config) - width: Image width - height: Image height - negative_prompt: Negative prompt - steps: Sampling steps - seed: Random seed - cfg: CFG scale - sampler: Sampler name - **params: Additional workflow parameters - - Returns: - Generated image URL/path - - Examples: - # Simplest: use default workflow (workflows/image_flux.json) - image_url = await pixelle_video.image(prompt="a beautiful cat") - - # Use specific workflow - image_url = await pixelle_video.image( - prompt="a cat", - workflow="image_flux.json" - ) - - # With additional parameters - image_url = await pixelle_video.image( - prompt="a 
cat", - workflow="image_flux.json", - width=1024, - height=1024, - steps=20, - seed=42 - ) - - # With absolute path - image_url = await pixelle_video.image( - prompt="a cat", - workflow="/path/to/custom.json" - ) - - # With custom ComfyUI server - image_url = await pixelle_video.image( - prompt="a cat", - comfyui_url="http://192.168.1.100:8188" - ) - """ - # 1. Resolve workflow (returns structured info) - workflow_info = self._resolve_workflow(workflow=workflow) - - # 2. Prepare ComfyKit config (supports both selfhost and runninghub) - kit_config = self._prepare_comfykit_config( - comfyui_url=comfyui_url, - runninghub_api_key=runninghub_api_key - ) - - # 3. Build workflow parameters - workflow_params = {"prompt": prompt} - - # Add optional parameters - if width is not None: - workflow_params["width"] = width - if height is not None: - workflow_params["height"] = height - if negative_prompt is not None: - workflow_params["negative_prompt"] = negative_prompt - if steps is not None: - workflow_params["steps"] = steps - if seed is not None: - workflow_params["seed"] = seed - if cfg is not None: - workflow_params["cfg"] = cfg - if sampler is not None: - workflow_params["sampler"] = sampler - - # Add any additional parameters - workflow_params.update(params) - - logger.debug(f"Workflow parameters: {workflow_params}") - - # 4. 
Execute workflow (ComfyKit auto-detects based on input type) - try: - kit = ComfyKit(**kit_config) - - # Determine what to pass to ComfyKit based on source - if workflow_info["source"] == "runninghub" and "workflow_id" in workflow_info: - # RunningHub: pass workflow_id (ComfyKit will use runninghub backend) - workflow_input = workflow_info["workflow_id"] - logger.info(f"Executing RunningHub workflow: {workflow_input}") - else: - # Selfhost: pass file path (ComfyKit will use local ComfyUI) - workflow_input = workflow_info["path"] - logger.info(f"Executing selfhost workflow: {workflow_input}") - - result = await kit.execute(workflow_input, workflow_params) - - # 5. Handle result - if result.status != "completed": - error_msg = result.msg or "Unknown error" - logger.error(f"Image generation failed: {error_msg}") - raise Exception(f"Image generation failed: {error_msg}") - - if not result.images: - logger.error("No images generated") - raise Exception("No images generated") - - image_url = result.images[0] - logger.info(f"✅ Generated image: {image_url}") - return image_url - - except Exception as e: - logger.error(f"Image generation error: {e}") - raise diff --git a/pixelle_video/services/media.py b/pixelle_video/services/media.py new file mode 100644 index 0000000..c915df1 --- /dev/null +++ b/pixelle_video/services/media.py @@ -0,0 +1,285 @@ +# Copyright (C) 2025 AIDC-AI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Media Generation Service - ComfyUI Workflow-based implementation + +Supports both image and video generation workflows. +Automatically detects output type based on ExecuteResult. +""" + +from typing import Optional + +from comfykit import ComfyKit +from loguru import logger + +from pixelle_video.services.comfy_base_service import ComfyBaseService +from pixelle_video.models.media import MediaResult + + +class MediaService(ComfyBaseService): + """ + Media generation service - Workflow-based + + Uses ComfyKit to execute image/video generation workflows. + Supports both image_ and video_ workflow prefixes. + + Usage: + # Use default workflow (workflows/image_flux.json) + media = await pixelle_video.media(prompt="a cat") + if media.is_image: + print(f"Generated image: {media.url}") + elif media.is_video: + print(f"Generated video: {media.url} ({media.duration}s)") + + # Use specific workflow + media = await pixelle_video.media( + prompt="a cat", + workflow="image_flux.json" + ) + + # List available workflows + workflows = pixelle_video.media.list_workflows() + """ + + WORKFLOW_PREFIX = "" # Will be overridden by _scan_workflows + DEFAULT_WORKFLOW = None # No hardcoded default, must be configured + WORKFLOWS_DIR = "workflows" + + def __init__(self, config: dict): + """ + Initialize media service + + Args: + config: Full application config dict + """ + super().__init__(config, service_name="image") # Keep "image" for config compatibility + + def _scan_workflows(self): + """ + Scan workflows for both image_ and video_ prefixes + + Override parent method to support multiple prefixes + """ + from pixelle_video.utils.os_util import list_resource_dirs, list_resource_files, get_resource_path + from pathlib import Path + + workflows = [] + + # Get all workflow source directories + source_dirs = list_resource_dirs("workflows") + + if not source_dirs: + logger.warning("No workflow source directories found") + return workflows + + # Scan each source directory for workflow 
files + for source_name in source_dirs: + # Get all JSON files for this source + workflow_files = list_resource_files("workflows", source_name) + + # Filter to only files matching image_ or video_ prefix + matching_files = [ + f for f in workflow_files + if (f.startswith("image_") or f.startswith("video_")) and f.endswith('.json') + ] + + for filename in matching_files: + try: + # Get actual file path + file_path = Path(get_resource_path("workflows", source_name, filename)) + workflow_info = self._parse_workflow_file(file_path, source_name) + workflows.append(workflow_info) + logger.debug(f"Found workflow: {workflow_info['key']}") + except Exception as e: + logger.error(f"Failed to parse workflow {source_name}/{filename}: {e}") + + # Sort by key (source/name) + return sorted(workflows, key=lambda w: w["key"]) + + async def __call__( + self, + prompt: str, + workflow: Optional[str] = None, + # Media type specification (required for proper handling) + media_type: str = "image", # "image" or "video" + # ComfyUI connection (optional overrides) + comfyui_url: Optional[str] = None, + runninghub_api_key: Optional[str] = None, + # Common workflow parameters + width: Optional[int] = None, + height: Optional[int] = None, + negative_prompt: Optional[str] = None, + steps: Optional[int] = None, + seed: Optional[int] = None, + cfg: Optional[float] = None, + sampler: Optional[str] = None, + **params + ) -> MediaResult: + """ + Generate media (image or video) using workflow + + Media type must be specified explicitly via media_type parameter. + Returns a MediaResult object containing media type and URL.
+ + Args: + prompt: Media generation prompt + workflow: Workflow filename (default: from config or "image_flux.json") + media_type: Type of media to generate - "image" or "video" (default: "image") + comfyui_url: ComfyUI URL (optional, overrides config) + runninghub_api_key: RunningHub API key (optional, overrides config) + width: Media width + height: Media height + negative_prompt: Negative prompt + steps: Sampling steps + seed: Random seed + cfg: CFG scale + sampler: Sampler name + **params: Additional workflow parameters + + Returns: + MediaResult object with media_type ("image" or "video") and url + + Examples: + # Simplest: use default workflow (workflows/image_flux.json) + media = await pixelle_video.media(prompt="a beautiful cat") + if media.is_image: + print(f"Image: {media.url}") + + # Use specific workflow + media = await pixelle_video.media( + prompt="a cat", + workflow="image_flux.json" + ) + + # Video workflow (note: media_type="video" is required to extract video output) + media = await pixelle_video.media( + prompt="a cat running", + workflow="video_wan.json", + media_type="video" + ) + if media.is_video: + print(f"Video: {media.url}, duration: {media.duration}s") + + # With additional parameters + media = await pixelle_video.media( + prompt="a cat", + workflow="image_flux.json", + width=1024, + height=1024, + steps=20, + seed=42 + ) + + # With absolute path + media = await pixelle_video.media( + prompt="a cat", + workflow="/path/to/custom.json" + ) + + # With custom ComfyUI server + media = await pixelle_video.media( + prompt="a cat", + comfyui_url="http://192.168.1.100:8188" + ) + """ + # 1. Resolve workflow (returns structured info) + workflow_info = self._resolve_workflow(workflow=workflow) + + # 2. Prepare ComfyKit config (supports both selfhost and runninghub) + kit_config = self._prepare_comfykit_config( + comfyui_url=comfyui_url, + runninghub_api_key=runninghub_api_key + ) + + # 3.
Build workflow parameters + workflow_params = {"prompt": prompt} + + # Add optional parameters + if width is not None: + workflow_params["width"] = width + if height is not None: + workflow_params["height"] = height + if negative_prompt is not None: + workflow_params["negative_prompt"] = negative_prompt + if steps is not None: + workflow_params["steps"] = steps + if seed is not None: + workflow_params["seed"] = seed + if cfg is not None: + workflow_params["cfg"] = cfg + if sampler is not None: + workflow_params["sampler"] = sampler + + # Add any additional parameters + workflow_params.update(params) + + logger.debug(f"Workflow parameters: {workflow_params}") + + # 4. Execute workflow (ComfyKit auto-detects based on input type) + try: + kit = ComfyKit(**kit_config) + + # Determine what to pass to ComfyKit based on source + if workflow_info["source"] == "runninghub" and "workflow_id" in workflow_info: + # RunningHub: pass workflow_id (ComfyKit will use runninghub backend) + workflow_input = workflow_info["workflow_id"] + logger.info(f"Executing RunningHub workflow: {workflow_input}") + else: + # Selfhost: pass file path (ComfyKit will use local ComfyUI) + workflow_input = workflow_info["path"] + logger.info(f"Executing selfhost workflow: {workflow_input}") + + result = await kit.execute(workflow_input, workflow_params) + + # 5. 
Handle result based on specified media_type + if result.status != "completed": + error_msg = result.msg or "Unknown error" + logger.error(f"Media generation failed: {error_msg}") + raise Exception(f"Media generation failed: {error_msg}") + + # Extract media based on specified type + if media_type == "video": + # Video workflow - get video from result + if not result.videos: + logger.error("No video generated (workflow returned no videos)") + raise Exception("No video generated") + + video_url = result.videos[0] + logger.info(f"✅ Generated video: {video_url}") + + # Try to extract duration from result (if available) + duration = None + if hasattr(result, 'duration') and result.duration: + duration = result.duration + + return MediaResult( + media_type="video", + url=video_url, + duration=duration + ) + else: # image + # Image workflow - get image from result + if not result.images: + logger.error("No image generated (workflow returned no images)") + raise Exception("No image generated") + + image_url = result.images[0] + logger.info(f"✅ Generated image: {image_url}") + + return MediaResult( + media_type="image", + url=image_url + ) + + except Exception as e: + logger.error(f"Media generation error: {e}") + raise diff --git a/pixelle_video/services/video.py b/pixelle_video/services/video.py index d9e8a8b..5cbe31c 100644 --- a/pixelle_video/services/video.py +++ b/pixelle_video/services/video.py @@ -224,20 +224,88 @@ class VideoService: -map "[v]" -map "[a]" output.mp4 """ try: - inputs = [ffmpeg.input(v) for v in videos] - ( - ffmpeg - .concat(*inputs, v=1, a=1) - .output(output) - .overwrite_output() - .run(capture_stdout=True, capture_stderr=True) + # Build filter_complex string manually + n = len(videos) + + # Build input stream labels: [0:v][0:a][1:v][1:a]... 
+ stream_spec = "".join([f"[{i}:v][{i}:a]" for i in range(n)]) + filter_complex = f"{stream_spec}concat=n={n}:v=1:a=1[v][a]" + + # Build ffmpeg command + cmd = ['ffmpeg'] + for video in videos: + cmd.extend(['-i', video]) + cmd.extend([ + '-filter_complex', filter_complex, + '-map', '[v]', + '-map', '[a]', + '-y', # Overwrite output + output + ]) + + # Run command + import subprocess + result = subprocess.run( + cmd, + capture_output=True, + text=True, + check=True ) + logger.success(f"Videos concatenated successfully: {output}") return output - except ffmpeg.Error as e: - error_msg = e.stderr.decode() if e.stderr else str(e) + except subprocess.CalledProcessError as e: + error_msg = e.stderr if e.stderr else str(e) logger.error(f"FFmpeg concat filter error: {error_msg}") raise RuntimeError(f"Failed to concatenate videos: {error_msg}") + except Exception as e: + logger.error(f"Concatenation error: {e}") + raise RuntimeError(f"Failed to concatenate videos: {e}") + + def _get_video_duration(self, video: str) -> float: + """Get video duration in seconds""" + try: + probe = ffmpeg.probe(video) + duration = float(probe['format']['duration']) + return duration + except Exception as e: + logger.warning(f"Failed to get video duration: {e}") + return 0.0 + + def _get_audio_duration(self, audio: str) -> float: + """Get audio duration in seconds""" + try: + probe = ffmpeg.probe(audio) + duration = float(probe['format']['duration']) + return duration + except Exception as e: + logger.warning(f"Failed to get audio duration: {e}, using estimate") + # Fallback: estimate based on file size (very rough) + import os + file_size = os.path.getsize(audio) + # Assume ~16kbps for MP3, so 2KB per second + estimated_duration = file_size / 2000 + return max(1.0, estimated_duration) # At least 1 second + + def has_audio_stream(self, video: str) -> bool: + """ + Check if video has audio stream + + Args: + video: Video file path + + Returns: + True if video has audio stream, False otherwise + 
""" + try: + probe = ffmpeg.probe(video) + audio_streams = [s for s in probe.get('streams', []) if s['codec_type'] == 'audio'] + has_audio = len(audio_streams) > 0 + logger.debug(f"Video {video} has_audio={has_audio}") + return has_audio + except Exception as e: + logger.warning(f"Failed to probe video audio streams: {e}, assuming no audio") + return False def merge_audio_video( self, @@ -247,9 +315,18 @@ class VideoService: replace_audio: bool = True, audio_volume: float = 1.0, video_volume: float = 0.0, + pad_strategy: str = "freeze", # "freeze" (freeze last frame) or "black" (black screen) ) -> str: """ - Merge audio with video + Merge audio with video, using the longer duration + + The output video duration will be the maximum of video and audio duration. + If audio is longer than video, the video will be padded using the specified strategy. + + Automatically handles videos with or without audio streams. + - If video has no audio: adds the audio track + - If video has audio and replace_audio=True: replaces with new audio + - If video has audio and replace_audio=False: mixes both audio tracks Args: video: Video file path @@ -259,6 +336,9 @@ class VideoService: audio_volume: Volume of the new audio (0.0 to 1.0+) video_volume: Volume of original video audio (0.0 to 1.0+) Only used when replace_audio=False + pad_strategy: Strategy to pad video if audio is longer + - "freeze": Freeze last frame (default) + - "black": Fill with black screen Returns: Path to the output video file @@ -267,28 +347,115 @@ class VideoService: RuntimeError: If FFmpeg execution fails Note: - - When replace_audio=True, video's original audio is removed - - When replace_audio=False, original and new audio are mixed - - Audio is trimmed/extended to match video duration + - Uses the longer duration between video and audio + - When audio is longer, video is padded using pad_strategy + - When video is longer, audio is looped or extended + - Automatically detects if video has audio + - When video 
is silent, audio is added regardless of replace_audio + - When replace_audio=True and video has audio, original audio is removed + - When replace_audio=False and video has audio, original and new audio are mixed """ + # Get durations of video and audio + video_duration = self._get_video_duration(video) + audio_duration = self._get_audio_duration(audio) + + logger.info(f"Video duration: {video_duration:.2f}s, Audio duration: {audio_duration:.2f}s") + + # Determine target duration (max of both) + target_duration = max(video_duration, audio_duration) + logger.info(f"Target output duration: {target_duration:.2f}s") + + # Check if video has audio stream + video_has_audio = self.has_audio_stream(video) + + # Prepare video stream (potentially with padding) + input_video = ffmpeg.input(video) + video_stream = input_video.video + + # Pad video if audio is longer + if audio_duration > video_duration: + pad_duration = audio_duration - video_duration + logger.info(f"Audio is longer, padding video by {pad_duration:.2f}s using '{pad_strategy}' strategy") + + if pad_strategy == "freeze": + # Freeze last frame: tpad filter + video_stream = video_stream.filter('tpad', stop_mode='clone', stop_duration=pad_duration) + else: # black + # Generate black frames for padding duration + from pixelle_video.utils.os_util import get_temp_path + import os + + # Get video properties + probe = ffmpeg.probe(video) + video_info = next(s for s in probe['streams'] if s['codec_type'] == 'video') + width = int(video_info['width']) + height = int(video_info['height']) + fps_str = video_info['r_frame_rate'] + fps_num, fps_den = map(int, fps_str.split('/')) + fps = fps_num / fps_den if fps_den != 0 else 30 + + # Create black video for padding + black_video_path = get_temp_path(f"black_pad_{os.path.basename(output)}") + black_input = ffmpeg.input( + f'color=c=black:s={width}x{height}:r={fps}', + f='lavfi', + t=pad_duration + ) + + # Concatenate original video with black padding + video_stream = 
ffmpeg.concat(video_stream, black_input.video, v=1, a=0) + + # Prepare audio stream (pad if needed to match target duration) + input_audio = ffmpeg.input(audio) + audio_stream = input_audio.audio.filter('volume', audio_volume) + + # Pad audio with silence if video is longer + if video_duration > audio_duration: + pad_duration = video_duration - audio_duration + logger.info(f"Video is longer, padding audio with {pad_duration:.2f}s silence") + # Use apad to add silence at the end + audio_stream = audio_stream.filter('apad', whole_dur=target_duration) + + if not video_has_audio: + logger.info(f"Video has no audio stream, adding audio track") + # Video is silent, just add the audio + try: + ( + ffmpeg + .output( + video_stream, + audio_stream, + output, + vcodec='libx264', # Re-encode video if padded + acodec='aac', + audio_bitrate='192k' + ) + .overwrite_output() + .run(capture_stdout=True, capture_stderr=True) + ) + + logger.success(f"Audio added to silent video: {output}") + return output + except ffmpeg.Error as e: + error_msg = e.stderr.decode() if e.stderr else str(e) + logger.error(f"FFmpeg error adding audio to silent video: {error_msg}") + raise RuntimeError(f"Failed to add audio to video: {error_msg}") + + # Video has audio, proceed with merging logger.info(f"Merging audio with video (replace={replace_audio})") try: - input_video = ffmpeg.input(video) - input_audio = ffmpeg.input(audio) - if replace_audio: # Replace audio: use only new audio, ignore original ( ffmpeg .output( - input_video.video, - input_audio.audio.filter('volume', audio_volume), + video_stream, + audio_stream, output, - vcodec='copy', + vcodec='libx264', # Re-encode video if padded acodec='aac', - audio_bitrate='192k', - shortest=None + audio_bitrate='192k' ) .overwrite_output() .run(capture_stdout=True, capture_stderr=True) @@ -298,20 +465,20 @@ class VideoService: mixed_audio = ffmpeg.filter( [ input_video.audio.filter('volume', video_volume), - input_audio.audio.filter('volume', 
audio_volume) + audio_stream ], 'amix', inputs=2, - duration='first' + duration='longest' # Use longest audio ) ( ffmpeg .output( - input_video.video, + video_stream, mixed_audio, output, - vcodec='copy', + vcodec='libx264', # Re-encode video if padded acodec='aac', audio_bitrate='192k' ) @@ -326,6 +493,92 @@ class VideoService: logger.error(f"FFmpeg merge error: {error_msg}") raise RuntimeError(f"Failed to merge audio and video: {error_msg}") + def overlay_image_on_video( + self, + video: str, + overlay_image: str, + output: str, + scale_mode: str = "contain" + ) -> str: + """ + Overlay a transparent image on top of video + + Args: + video: Base video file path + overlay_image: Transparent overlay image path (e.g., rendered HTML with transparent background) + output: Output video file path + scale_mode: How to scale the base video to fit the overlay size + - "contain": Scale video to fit within overlay dimensions (letterbox/pillarbox) + - "cover": Scale video to cover overlay dimensions (may crop) + - "stretch": Stretch video to exact overlay dimensions + + Returns: + Path to the output video file + + Raises: + RuntimeError: If FFmpeg execution fails + + Note: + - Overlay image should have transparent background + - Video is scaled to match overlay dimensions based on scale_mode + - Final video size matches overlay image size + - Video codec is re-encoded to support overlay + """ + logger.info(f"Overlaying image on video (scale_mode={scale_mode})") + + try: + # Get overlay image dimensions + overlay_probe = ffmpeg.probe(overlay_image) + overlay_stream = next(s for s in overlay_probe['streams'] if s['codec_type'] == 'video') + overlay_width = int(overlay_stream['width']) + overlay_height = int(overlay_stream['height']) + + logger.debug(f"Overlay dimensions: {overlay_width}x{overlay_height}") + + input_video = ffmpeg.input(video) + input_overlay = ffmpeg.input(overlay_image) + + # Scale video to fit overlay size using scale_mode + if scale_mode == "contain": + # 
Scale to fit (letterbox/pillarbox if aspect ratio differs) + # Use scale filter with force_original_aspect_ratio=decrease and pad to center + scaled_video = ( + input_video + .filter('scale', overlay_width, overlay_height, force_original_aspect_ratio='decrease') + .filter('pad', overlay_width, overlay_height, '(ow-iw)/2', '(oh-ih)/2', color='black') + ) + elif scale_mode == "cover": + # Scale to cover (crop if aspect ratio differs) + scaled_video = ( + input_video + .filter('scale', overlay_width, overlay_height, force_original_aspect_ratio='increase') + .filter('crop', overlay_width, overlay_height) + ) + else: # stretch + # Stretch to exact dimensions + scaled_video = input_video.filter('scale', overlay_width, overlay_height) + + # Overlay the transparent image on top of the scaled video + output_stream = ffmpeg.overlay(scaled_video, input_overlay) + + ( + ffmpeg + .output(output_stream, output, + vcodec='libx264', + pix_fmt='yuv420p', + preset='medium', + crf=23) + .overwrite_output() + .run(capture_stdout=True, capture_stderr=True) + ) + + logger.success(f"Image overlaid on video: {output}") + return output + except ffmpeg.Error as e: + error_msg = e.stderr.decode() if e.stderr else str(e) + logger.error(f"FFmpeg overlay error: {error_msg}") + raise RuntimeError(f"Failed to overlay image on video: {error_msg}") + def create_video_from_image( self, image: str, diff --git a/pixelle_video/utils/content_generators.py b/pixelle_video/utils/content_generators.py index bbda711..21b4d59 100644 --- a/pixelle_video/utils/content_generators.py +++ b/pixelle_video/utils/content_generators.py @@ -321,6 +321,98 @@ async def generate_image_prompts( return all_prompts +async def generate_video_prompts( + llm_service, + narrations: List[str], + min_words: int = 30, + max_words: int = 60, + batch_size: int = 10, + max_retries: int = 3, + progress_callback: Optional[callable] = None +) -> List[str]: + """ + Generate video prompts from narrations (with batching and retry) + + 
Args: + llm_service: LLM service instance + narrations: List of narrations + min_words: Min video prompt length + max_words: Max video prompt length + batch_size: Max narrations per batch (default: 10) + max_retries: Max retry attempts per batch (default: 3) + progress_callback: Optional callback(completed, total, message) for progress updates + + Returns: + List of video prompts (base prompts, without prefix applied) + """ + from pixelle_video.prompts.video_generation import build_video_prompt_prompt + + logger.info(f"Generating video prompts for {len(narrations)} narrations (batch_size={batch_size})") + + # Split narrations into batches + batches = [narrations[i:i + batch_size] for i in range(0, len(narrations), batch_size)] + logger.info(f"Split into {len(batches)} batches") + + all_prompts = [] + + # Process each batch + for batch_idx, batch_narrations in enumerate(batches, 1): + logger.info(f"Processing batch {batch_idx}/{len(batches)} ({len(batch_narrations)} narrations)") + + # Retry logic for this batch + for attempt in range(1, max_retries + 1): + try: + # Generate prompts for this batch + prompt = build_video_prompt_prompt( + narrations=batch_narrations, + min_words=min_words, + max_words=max_words + ) + + response = await llm_service( + prompt=prompt, + temperature=0.7, + max_tokens=8192 + ) + + logger.debug(f"Batch {batch_idx} attempt {attempt}: LLM response length: {len(response)} chars") + + # Parse JSON + result = _parse_json(response) + + if "video_prompts" not in result: + raise KeyError("Invalid response format: missing 'video_prompts'") + + batch_prompts = result["video_prompts"] + + # Validate batch result + if len(batch_prompts) != len(batch_narrations): + raise ValueError( + f"Prompt count mismatch: expected {len(batch_narrations)}, got {len(batch_prompts)}" + ) + + # Success - add to all_prompts + all_prompts.extend(batch_prompts) + logger.info(f"✓ Batch {batch_idx} completed: {len(batch_prompts)} video prompts") + + # Report progress + if 
progress_callback: + completed = len(all_prompts) + total = len(narrations) + progress_callback(completed, total, f"Batch {batch_idx}/{len(batches)} completed") + + break # Success, move to next batch + + except Exception as e: + logger.warning(f"✗ Batch {batch_idx} attempt {attempt} failed: {e}") + if attempt >= max_retries: + raise + logger.info(f"Retrying batch {batch_idx}...") + + logger.info(f"✅ Generated {len(all_prompts)} video prompts") + return all_prompts + + def _parse_json(text: str) -> dict: """ Parse JSON from text, with fallback to extract JSON from markdown code blocks diff --git a/pixelle_video/utils/os_util.py b/pixelle_video/utils/os_util.py index 3538f7e..12c26dc 100644 --- a/pixelle_video/utils/os_util.py +++ b/pixelle_video/utils/os_util.py @@ -260,7 +260,7 @@ def get_task_path(task_id: str, *paths: str) -> str: def get_task_frame_path( task_id: str, frame_index: int, - file_type: Literal["audio", "image", "composed", "segment"] + file_type: Literal["audio", "image", "video", "composed", "segment"] ) -> str: """ Get frame file path within task directory @@ -268,7 +268,7 @@ def get_task_frame_path( Args: task_id: Task ID frame_index: Frame index (0-based internally, but filename starts from 01) - file_type: File type (audio/image/composed/segment) + file_type: File type (audio/image/video/composed/segment) Returns: Absolute path to frame file @@ -280,6 +280,7 @@ def get_task_frame_path( ext_map = { "audio": "mp3", "image": "png", + "video": "mp4", "composed": "png", "segment": "mp4" } diff --git a/pixelle_video/utils/template_util.py b/pixelle_video/utils/template_util.py index 501a723..6fa0827 100644 --- a/pixelle_video/utils/template_util.py +++ b/pixelle_video/utils/template_util.py @@ -18,6 +18,7 @@ import os from pathlib import Path from typing import List, Tuple, Optional, Literal from pydantic import BaseModel, Field +import logging from pixelle_video.utils.os_util import ( get_resource_path, @@ -26,6 +27,8 @@ from 
pixelle_video.utils.os_util import ( resource_exists ) +logger = logging.getLogger(__name__) + def parse_template_size(template_path: str) -> Tuple[int, int]: """ @@ -316,7 +319,7 @@ def resolve_template_path(template_input: Optional[str]) -> str: Args: template_input: Can be: - - None: Use default "1080x1920/default.html" + - None: Use default "1080x1920/image_default.html" - "template.html": Use default size + this template - "1080x1920/template.html": Full relative path - "templates/1080x1920/template.html": Absolute-ish path (legacy) @@ -330,15 +333,15 @@ def resolve_template_path(template_input: Optional[str]) -> str: Examples: >>> resolve_template_path(None) - 'templates/1080x1920/default.html' - >>> resolve_template_path("modern.html") - 'templates/1080x1920/modern.html' - >>> resolve_template_path("1920x1080/default.html") - 'templates/1920x1080/default.html' + 'templates/1080x1920/image_default.html' + >>> resolve_template_path("image_modern.html") + 'templates/1080x1920/image_modern.html' + >>> resolve_template_path("1920x1080/image_default.html") + 'templates/1920x1080/image_default.html' """ # Default case if template_input is None: - template_input = "1080x1920/default.html" + template_input = "1080x1920/image_default.html" # Parse input to extract size and template name size = None @@ -359,6 +362,18 @@ def resolve_template_path(template_input: Optional[str]) -> str: size = "1080x1920" template_name = template_input + # Backward compatibility: migrate "default.html" to "image_default.html" + if template_name == "default.html": + migrated_name = "image_default.html" + try: + # Try migrated name first + path = get_resource_path("templates", size, migrated_name) + logger.info(f"Backward compatibility: migrated '{template_input}' to '{size}/{migrated_name}'") + return path + except FileNotFoundError: + # Fall through to try original name + logger.warning(f"Migrated template '{size}/{migrated_name}' not found, trying original name") + # Use resource API to 
resolve path (custom > default) try: return get_resource_path("templates", size, template_name) @@ -367,6 +382,120 @@ def resolve_template_path(template_input: Optional[str]) -> str: raise FileNotFoundError( f"Template not found: {size}/{template_name}\n" f"Available sizes: {available_sizes}\n" - f"Hint: Use format 'SIZExSIZE/template.html' (e.g., '1080x1920/default.html')" + f"Hint: Use format 'SIZExSIZE/template.html' (e.g., '1080x1920/image_default.html')" ) + +def get_template_type(template_name: str) -> Literal['static', 'image', 'video']: + """ + Detect template type from template filename + + Template naming convention: + - static_*.html: Static style templates (no AI-generated media) + - image_*.html: Templates requiring AI-generated images + - video_*.html: Templates requiring AI-generated videos + + Args: + template_name: Template filename like "image_default.html" or "video_simple.html" + + Returns: + Template type: 'static', 'image', or 'video' + + Examples: + >>> get_template_type("static_simple.html") + 'static' + >>> get_template_type("image_default.html") + 'image' + >>> get_template_type("video_simple.html") + 'video' + """ + name = Path(template_name).name + + if name.startswith("static_"): + return "static" + elif name.startswith("video_"): + return "video" + elif name.startswith("image_"): + return "image" + else: + # Fallback: try to detect from legacy names + logger.warning( + f"Template '{template_name}' doesn't follow naming convention (static_/image_/video_). " + f"Defaulting to 'image' type." 
+ ) + return "image" + + +def filter_templates_by_type( + templates: List[TemplateInfo], + template_type: Literal['static', 'image', 'video'] +) -> List[TemplateInfo]: + """ + Filter templates by type + + Args: + templates: List of TemplateInfo objects + template_type: Type to filter by ('static', 'image', or 'video') + + Returns: + Filtered list of TemplateInfo objects + + Examples: + >>> all_templates = get_all_templates_with_info() + >>> image_templates = filter_templates_by_type(all_templates, 'image') + >>> len(image_templates) > 0 + True + """ + filtered = [] + for t in templates: + template_name = t.display_info.name + if get_template_type(template_name) == template_type: + filtered.append(t) + return filtered + + +def get_templates_grouped_by_size_and_type( + template_type: Optional[Literal['static', 'image', 'video']] = None +) -> dict: + """ + Get templates grouped by size, optionally filtered by type + + Args: + template_type: Optional type filter ('static', 'image', or 'video') + + Returns: + Dict with size as key, list of TemplateInfo as value + Ordered by orientation priority: portrait > landscape > square + + Examples: + >>> # Get all templates + >>> all_grouped = get_templates_grouped_by_size_and_type() + + >>> # Get only image templates + >>> image_grouped = get_templates_grouped_by_size_and_type('image') + """ + from collections import defaultdict + + templates = get_all_templates_with_info() + + # Filter by type if specified + if template_type is not None: + templates = filter_templates_by_type(templates, template_type) + + grouped = defaultdict(list) + + for t in templates: + grouped[t.display_info.size].append(t) + + # Sort groups by orientation priority: portrait > landscape > square + orientation_priority = {'portrait': 0, 'landscape': 1, 'square': 2} + + sorted_grouped = {} + for size in sorted(grouped.keys(), key=lambda s: ( + orientation_priority.get(grouped[s][0].display_info.orientation, 3), + s + )): + sorted_grouped[size] = 
sorted(grouped[size], key=lambda t: t.display_info.name) + + return sorted_grouped + diff --git a/pyproject.toml b/pyproject.toml index 07c7eb7..d98dda3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ dependencies = [ "uvicorn[standard]>=0.32.0", "python-multipart>=0.0.12", "comfykit>=0.1.9", + "beautifulsoup4>=4.14.2", ] [project.optional-dependencies] diff --git a/templates/1080x1080/minimal_framed.html b/templates/1080x1080/image_minimal_framed.html similarity index 95% rename from templates/1080x1080/minimal_framed.html rename to templates/1080x1080/image_minimal_framed.html index 5e8f20a..99a7212 100644 --- a/templates/1080x1080/minimal_framed.html +++ b/templates/1080x1080/image_minimal_framed.html @@ -2,6 +2,8 @@ + + 极简边框风格 - 1080x1080 + + + +
+ +
+ +
+ + +
+ + +
+
{{title}}
+
+ + +
+
{{text}}
+
+ + + +
+ + \ No newline at end of file diff --git a/templates/1920x1080/film.html b/templates/1920x1080/image_film.html similarity index 97% rename from templates/1920x1080/film.html rename to templates/1920x1080/image_film.html index 917fa0c..a16bd41 100644 --- a/templates/1920x1080/film.html +++ b/templates/1920x1080/image_film.html @@ -2,6 +2,8 @@ + + 视频模板 - 电影风格