AI-Video/pixelle_video/pipelines/standard.py
feat: Add smart paragraph merging mode with AI grouping
- Add "smart" split mode that uses LLM to intelligently merge related paragraphs
- Implement two-step approach: analyze text structure, then group by semantic relevance
- Add paragraph_merging.py with analysis and grouping prompts
- Update UI to support smart mode selection with auto-detect hint
- Add i18n translations for smart mode (en_US, zh_CN)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-17 00:19:46 +08:00
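For reference, a minimal sketch of how the new mode is driven (parameter names match the generate_content() call in the file below; the surrounding script_text and llm objects are assumed to exist):

    narrations = await split_narration_script(
        script_text,
        split_mode="smart",      # LLM merges related paragraphs
        llm_service=llm,         # only passed for smart mode
        target_segments=8,       # desired number of merged groups
    )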


# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Standard Video Generation Pipeline
Standard workflow for generating short videos from topic or fixed script.
This is the default pipeline for general-purpose video generation.
Refactored to use LinearVideoPipeline (Template Method Pattern).
"""
from datetime import datetime
from pathlib import Path
from typing import Optional, Callable, Literal, List
import asyncio
import shutil
from loguru import logger
from pixelle_video.pipelines.linear import LinearVideoPipeline, PipelineContext
from pixelle_video.models.progress import ProgressEvent
from pixelle_video.models.storyboard import (
Storyboard,
StoryboardFrame,
StoryboardConfig,
ContentMetadata,
VideoGenerationResult
)
from pixelle_video.utils.content_generators import (
generate_title,
generate_narrations_from_topic,
split_narration_script,
generate_image_prompts,
)
from pixelle_video.utils.os_util import (
create_task_output_dir,
get_task_final_video_path
)
from pixelle_video.utils.template_util import get_template_type
from pixelle_video.utils.prompt_helper import build_image_prompt
from pixelle_video.services.video import VideoService
class StandardPipeline(LinearVideoPipeline):
"""
Standard video generation pipeline
Workflow:
1. Generate/determine title
2. Generate narrations (from topic or split fixed script)
3. Generate image prompts for each narration
4. For each frame:
- Generate audio (TTS)
- Generate image
- Compose frame with template
- Create video segment
5. Concatenate all segments
6. Add BGM (optional)
Supports two modes:
- "generate": LLM generates narrations from topic
- "fixed": Use provided script as-is (each line = one narration)
"""
# ==================== Lifecycle Methods ====================
async def setup_environment(self, ctx: PipelineContext):
"""Step 1: Setup task directory and environment."""
text = ctx.input_text
mode = ctx.params.get("mode", "generate")
logger.info(f"🚀 Starting StandardPipeline in '{mode}' mode")
logger.info(f" Text length: {len(text)} chars")
# Create isolated task directory
task_dir, task_id = create_task_output_dir()
ctx.task_id = task_id
ctx.task_dir = task_dir
logger.info(f"📁 Task directory created: {task_dir}")
logger.info(f" Task ID: {task_id}")
# Determine final video path
output_path = ctx.params.get("output_path")
# Always render/concatenate into the task directory; if the caller supplied
# an output_path, post_production copies the finished video there.
ctx.final_video_path = get_task_final_video_path(task_id)
if output_path is not None:
logger.info(f" Will copy final video to: {output_path}")
async def generate_content(self, ctx: PipelineContext):
"""Step 2: Generate or process script/narrations."""
mode = ctx.params.get("mode", "generate")
text = ctx.input_text
n_scenes = ctx.params.get("n_scenes", 5)
min_words = ctx.params.get("min_narration_words", 5)
max_words = ctx.params.get("max_narration_words", 20)
if mode == "generate":
self._report_progress(ctx.progress_callback, "generating_narrations", 0.05)
ctx.narrations = await generate_narrations_from_topic(
self.llm,
topic=text,
n_scenes=n_scenes,
min_words=min_words,
max_words=max_words
)
logger.info(f"✅ Generated {len(ctx.narrations)} narrations")
else: # fixed
self._report_progress(ctx.progress_callback, "splitting_script", 0.05)
split_mode = ctx.params.get("split_mode", "paragraph")
target_segments = ctx.params.get("target_segments", 8)
ctx.narrations = await split_narration_script(
text,
split_mode=split_mode,
llm_service=self.llm if split_mode == "smart" else None,
target_segments=target_segments
)
logger.info(f"✅ Split script into {len(ctx.narrations)} segments (mode={split_mode})")
logger.info(f" Note: n_scenes={n_scenes} is ignored in fixed mode")
async def determine_title(self, ctx: PipelineContext):
"""Step 3: Determine or generate video title."""
# Note: LinearVideoPipeline runs generate_content before determine_title,
# whereas the pre-refactor StandardPipeline determined the title first.
# The order does not matter here because the two steps are independent.
title = ctx.params.get("title")
mode = ctx.params.get("mode", "generate")
text = ctx.input_text
if title:
ctx.title = title
logger.info(f" Title: '{title}' (user-specified)")
else:
self._report_progress(ctx.progress_callback, "generating_title", 0.01)
if mode == "generate":
ctx.title = await generate_title(self.llm, text, strategy="auto")
logger.info(f" Title: '{ctx.title}' (auto-generated)")
else: # fixed
ctx.title = await generate_title(self.llm, text, strategy="llm")
logger.info(f" Title: '{ctx.title}' (LLM-generated)")
async def plan_visuals(self, ctx: PipelineContext):
"""Step 4: Generate image prompts or visual descriptions."""
# Detect template type to determine if media generation is needed
frame_template = ctx.params.get("frame_template") or "1080x1920/default.html"
template_name = Path(frame_template).name
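# The template's filename determines whether frames need a generated image,
# a generated video clip, or no media at all (static).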
template_type = get_template_type(template_name)
template_requires_media = (template_type in ["image", "video"])
if template_type == "image":
logger.info(f"📸 Template requires image generation")
elif template_type == "video":
logger.info(f"🎬 Template requires video generation")
else: # static
logger.info(f"⚡ Static template - skipping media generation pipeline")
logger.info(f" 💡 Benefits: Faster generation + Lower cost + No ComfyUI dependency")
# Only generate image prompts if template requires media
if template_requires_media:
self._report_progress(ctx.progress_callback, "generating_image_prompts", 0.15)
prompt_prefix = ctx.params.get("prompt_prefix")
min_words = ctx.params.get("min_image_prompt_words", 30)
max_words = ctx.params.get("max_image_prompt_words", 60)
# Auto-detect characters from narrations if CharacterMemory is enabled
character_memory = ctx.params.get("character_memory")
if character_memory and character_memory.config.auto_detect_characters:
logger.info("🔍 Auto-detecting characters from narrations...")
for i, narration in enumerate(ctx.narrations):
await character_memory.detect_characters_from_narration(narration, frame_index=i)
if character_memory.characters:
detected_names = [c.name for c in character_memory.characters]
logger.info(f"✅ Detected {len(character_memory.characters)} characters: {detected_names}")
# Override prompt_prefix if provided
original_prefix = None
if prompt_prefix is not None:
image_config = self.core.config.get("comfyui", {}).get("image", {})
original_prefix = image_config.get("prompt_prefix")
image_config["prompt_prefix"] = prompt_prefix
logger.info(f"Using custom prompt_prefix: '{prompt_prefix}'")
try:
# Create progress callback wrapper for image prompt generation
def image_prompt_progress(completed: int, total: int, message: str):
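# Map batch completion onto the 0.15-0.30 slice of overall progress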
batch_progress = completed / total if total > 0 else 0
overall_progress = 0.15 + (batch_progress * 0.15)
self._report_progress(
ctx.progress_callback,
"generating_image_prompts",
overall_progress,
extra_info=message
)
# Generate base image prompts
base_image_prompts = await generate_image_prompts(
self.llm,
narrations=ctx.narrations,
min_words=min_words,
max_words=max_words,
progress_callback=image_prompt_progress
)
# Apply character memory enhancement (if available)
character_memory = ctx.params.get("character_memory")
if character_memory and character_memory.characters:
active_characters = [c for c in character_memory.characters if getattr(c, 'is_active', True)]
if active_characters:
logger.info(f"🎭 Applying {len(active_characters)} active characters to {len(base_image_prompts)} prompts")
base_image_prompts = [
character_memory.apply_to_prompt(p) for p in base_image_prompts
]
# Apply prompt prefix
image_config = self.core.config.get("comfyui", {}).get("image", {})
prompt_prefix_to_use = prompt_prefix if prompt_prefix is not None else image_config.get("prompt_prefix", "")
ctx.image_prompts = []
for base_prompt in base_image_prompts:
final_prompt = build_image_prompt(base_prompt, prompt_prefix_to_use)
ctx.image_prompts.append(final_prompt)
finally:
# Restore the config's original prompt_prefix if it was overridden above;
# drop the key entirely when the config had no prefix to begin with.
if prompt_prefix is not None and original_prefix is not None:
image_config["prompt_prefix"] = original_prefix
elif prompt_prefix is not None:
image_config.pop("prompt_prefix", None)
logger.info(f"✅ Generated {len(ctx.image_prompts)} image prompts")
else:
# Static template - skip image prompt generation entirely
ctx.image_prompts = [None] * len(ctx.narrations)
logger.info(f"⚡ Skipped image prompt generation (static template)")
logger.info(f" 💡 Savings: {len(ctx.narrations)} LLM calls + {len(ctx.narrations)} media generations")
async def initialize_storyboard(self, ctx: PipelineContext):
"""Step 5: Create Storyboard object and frames."""
# === Handle TTS parameter compatibility ===
tts_inference_mode = ctx.params.get("tts_inference_mode")
tts_voice = ctx.params.get("tts_voice")
voice_id = ctx.params.get("voice_id")
tts_workflow = ctx.params.get("tts_workflow")
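# Two parameter styles coexist: the web UI sends tts_inference_mode ("local"
# or "comfyui") plus tts_voice, while older callers pass voice_id and/or
# tts_workflow directly. Both are normalized into final_voice_id / final_tts_workflow.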
final_voice_id = None
final_tts_workflow = tts_workflow
if tts_inference_mode:
# New API from web UI
if tts_inference_mode == "local":
final_voice_id = tts_voice or "zh-CN-YunjianNeural"
final_tts_workflow = None
logger.debug(f"TTS Mode: local (voice={final_voice_id})")
elif tts_inference_mode == "comfyui":
final_voice_id = None
logger.debug(f"TTS Mode: comfyui (workflow={final_tts_workflow})")
else:
# Old API
final_voice_id = voice_id or tts_voice or "zh-CN-YunjianNeural"
logger.debug(f"TTS Mode: legacy (voice_id={final_voice_id}, workflow={final_tts_workflow})")
# Create config
ctx.config = StoryboardConfig(
task_id=ctx.task_id,
n_storyboard=len(ctx.narrations), # Use actual length
min_narration_words=ctx.params.get("min_narration_words", 5),
max_narration_words=ctx.params.get("max_narration_words", 20),
min_image_prompt_words=ctx.params.get("min_image_prompt_words", 30),
max_image_prompt_words=ctx.params.get("max_image_prompt_words", 60),
video_fps=ctx.params.get("video_fps", 30),
tts_inference_mode=tts_inference_mode or "local",
voice_id=final_voice_id,
tts_workflow=final_tts_workflow,
tts_speed=ctx.params.get("tts_speed", 1.2),
ref_audio=ctx.params.get("ref_audio"),
media_width=ctx.params.get("media_width"),
media_height=ctx.params.get("media_height"),
media_workflow=ctx.params.get("media_workflow"),
frame_template=ctx.params.get("frame_template") or "1080x1920/default.html",
template_params=ctx.params.get("template_params")
)
# Create storyboard
ctx.storyboard = Storyboard(
title=ctx.title,
config=ctx.config,
content_metadata=ctx.params.get("content_metadata"),
created_at=datetime.now()
)
# Create frames
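# (image_prompt is None for static templates, so those frames carry no media prompt)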
for i, (narration, image_prompt) in enumerate(zip(ctx.narrations, ctx.image_prompts)):
frame = StoryboardFrame(
index=i,
narration=narration,
image_prompt=image_prompt,
created_at=datetime.now()
)
ctx.storyboard.frames.append(frame)
async def produce_assets(self, ctx: PipelineContext):
"""Step 6: Generate audio, images, and render frames (Core processing)."""
storyboard = ctx.storyboard
config = ctx.config
# Check if using RunningHub workflows for parallel processing
is_runninghub = (
(config.tts_workflow and config.tts_workflow.startswith("runninghub/")) or
(config.media_workflow and config.media_workflow.startswith("runninghub/"))
)
# Get concurrent limit from config_manager (supports hot reload without restart)
from pixelle_video.config import config_manager
runninghub_concurrent_limit = config_manager.config.comfyui.runninghub_concurrent_limit or 1
if is_runninghub and runninghub_concurrent_limit > 1:
logger.info(f"🚀 Using parallel processing for RunningHub workflows (max {runninghub_concurrent_limit} concurrent)")
semaphore = asyncio.Semaphore(runninghub_concurrent_limit)
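# All frames are scheduled at once; the semaphore caps concurrent RunningHub
# jobs, and results are re-ordered by frame index after gather() below.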
completed_count = 0
async def process_frame_with_semaphore(i: int, frame: StoryboardFrame):
nonlocal completed_count
async with semaphore:
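# Frame processing maps onto the 0.2-0.8 band of overall progress;
# concatenation later reports 0.85 and completion 1.0.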
base_progress = 0.2
frame_range = 0.6
per_frame_progress = frame_range / len(storyboard.frames)
# Create frame-specific progress callback
def frame_progress_callback(event: ProgressEvent):
overall_progress = base_progress + (per_frame_progress * completed_count) + (per_frame_progress * event.progress)
if ctx.progress_callback:
adjusted_event = ProgressEvent(
event_type=event.event_type,
progress=overall_progress,
frame_current=i+1,
frame_total=len(storyboard.frames),
step=event.step,
action=event.action
)
ctx.progress_callback(adjusted_event)
# Report frame start
self._report_progress(
ctx.progress_callback,
"processing_frame",
base_progress + (per_frame_progress * completed_count),
frame_current=i+1,
frame_total=len(storyboard.frames)
)
processed_frame = await self.core.frame_processor(
frame=frame,
storyboard=storyboard,
config=config,
total_frames=len(storyboard.frames),
progress_callback=frame_progress_callback
)
completed_count += 1
logger.info(f"✅ Frame {i+1} completed ({processed_frame.duration:.2f}s) [{completed_count}/{len(storyboard.frames)}]")
return i, processed_frame
# Create all tasks and execute in parallel
tasks = [process_frame_with_semaphore(i, frame) for i, frame in enumerate(storyboard.frames)]
results = await asyncio.gather(*tasks)
# Update frames in order and calculate total duration
for idx, processed_frame in sorted(results, key=lambda x: x[0]):
storyboard.frames[idx] = processed_frame
storyboard.total_duration += processed_frame.duration
logger.info(f"✅ All frames processed in parallel (total duration: {storyboard.total_duration:.2f}s)")
else:
# Serial processing for non-RunningHub workflows
logger.info("⚙️ Using serial processing (non-RunningHub workflow)")
for i, frame in enumerate(storyboard.frames):
base_progress = 0.2
frame_range = 0.6
per_frame_progress = frame_range / len(storyboard.frames)
# Create frame-specific progress callback
def frame_progress_callback(event: ProgressEvent):
overall_progress = base_progress + (per_frame_progress * i) + (per_frame_progress * event.progress)
if ctx.progress_callback:
adjusted_event = ProgressEvent(
event_type=event.event_type,
progress=overall_progress,
frame_current=event.frame_current,
frame_total=event.frame_total,
step=event.step,
action=event.action
)
ctx.progress_callback(adjusted_event)
# Report frame start
self._report_progress(
ctx.progress_callback,
"processing_frame",
base_progress + (per_frame_progress * i),
frame_current=i+1,
frame_total=len(storyboard.frames)
)
processed_frame = await self.core.frame_processor(
frame=frame,
storyboard=storyboard,
config=config,
total_frames=len(storyboard.frames),
progress_callback=frame_progress_callback
)
storyboard.total_duration += processed_frame.duration
logger.info(f"✅ Frame {i+1} completed ({processed_frame.duration:.2f}s)")
async def post_production(self, ctx: PipelineContext):
"""Step 7: Concatenate videos and add BGM."""
self._report_progress(ctx.progress_callback, "concatenating", 0.85)
storyboard = ctx.storyboard
segment_paths = [frame.video_segment_path for frame in storyboard.frames]
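# Concatenate the per-frame segments in order; BGM (if provided) is mixed
# over the result according to bgm_volume and bgm_mode.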
video_service = VideoService()
final_video_path = video_service.concat_videos(
videos=segment_paths,
output=ctx.final_video_path,
bgm_path=ctx.params.get("bgm_path"),
bgm_volume=ctx.params.get("bgm_volume", 0.2),
bgm_mode=ctx.params.get("bgm_mode", "loop")
)
storyboard.final_video_path = final_video_path
storyboard.completed_at = datetime.now()
# Copy to user-specified path if provided
user_specified_output = ctx.params.get("output_path")
if user_specified_output:
Path(user_specified_output).parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(final_video_path, user_specified_output)
logger.info(f"📹 Final video copied to: {user_specified_output}")
ctx.final_video_path = user_specified_output
storyboard.final_video_path = user_specified_output
logger.success(f"🎬 Video generation completed: {ctx.final_video_path}")
async def finalize(self, ctx: PipelineContext) -> VideoGenerationResult:
"""Step 8: Create result object and persist metadata."""
self._report_progress(ctx.progress_callback, "completed", 1.0)
video_path_obj = Path(ctx.final_video_path)
file_size = video_path_obj.stat().st_size
result = VideoGenerationResult(
video_path=ctx.final_video_path,
storyboard=ctx.storyboard,
duration=ctx.storyboard.total_duration,
file_size=file_size
)
ctx.result = result
logger.info(f"✅ Generated video: {ctx.final_video_path}")
logger.info(f" Duration: {ctx.storyboard.total_duration:.2f}s")
logger.info(f" Size: {file_size / (1024*1024):.2f} MB")
logger.info(f" Frames: {len(ctx.storyboard.frames)}")
# Persist metadata
await self._persist_task_data(ctx)
return result
async def _persist_task_data(self, ctx: PipelineContext):
"""
Persist task metadata and storyboard to filesystem
"""
try:
storyboard = ctx.storyboard
result = ctx.result
task_id = storyboard.config.task_id
if not task_id:
logger.warning("No task_id in storyboard, skipping persistence")
return
# Build metadata - filter out non-serializable objects
clean_input = {}
for key, value in ctx.params.items():
# Skip non-serializable objects like CharacterMemory
if key == "character_memory":
# Convert to serializable dict if present
if value is not None and hasattr(value, 'to_dict'):
clean_input["character_memory"] = value.to_dict()
elif key == "progress_callback":
# Skip callback functions
continue
elif callable(value):
# Skip any callable objects
continue
else:
clean_input[key] = value
clean_input["text"] = ctx.input_text # Ensure text is included
if not clean_input.get("title"):
clean_input["title"] = storyboard.title
metadata = {
"task_id": task_id,
"created_at": storyboard.created_at.isoformat() if storyboard.created_at else None,
"completed_at": storyboard.completed_at.isoformat() if storyboard.completed_at else None,
"status": "completed",
"input": clean_input,
"result": {
"video_path": result.video_path,
"duration": result.duration,
"file_size": result.file_size,
"n_frames": len(storyboard.frames)
},
"config": {
"llm_model": self.core.config.get("llm", {}).get("model", "unknown"),
"llm_base_url": self.core.config.get("llm", {}).get("base_url", "unknown"),
"comfyui_url": self.core.config.get("comfyui", {}).get("comfyui_url", "unknown"),
"runninghub_enabled": bool(self.core.config.get("comfyui", {}).get("runninghub_api_key")),
}
}
# Save metadata
await self.core.persistence.save_task_metadata(task_id, metadata)
logger.info(f"💾 Saved task metadata: {task_id}")
# Save storyboard
await self.core.persistence.save_storyboard(task_id, storyboard)
logger.info(f"💾 Saved storyboard: {task_id}")
except Exception as e:
logger.error(f"Failed to persist task data: {e}")
# Don't raise - persistence failure shouldn't break video generation