diff --git a/pixelle_video/pipelines/custom.py b/pixelle_video/pipelines/custom.py index f31ca07..0030214 100644 --- a/pixelle_video/pipelines/custom.py +++ b/pixelle_video/pipelines/custom.py @@ -86,7 +86,9 @@ class CustomPipeline(BasePipeline): custom_param_example: str = "default_value", # === Standard Parameters (keep these for compatibility) === - voice_id: str = "[Chinese] zh-CN Yunjian", + tts_inference_mode: Optional[str] = None, # "local" or "comfyui" + voice_id: Optional[str] = None, # Deprecated, use tts_voice + tts_voice: Optional[str] = None, # Voice ID for local mode tts_workflow: Optional[str] = None, tts_speed: float = 1.2, ref_audio: Optional[str] = None, @@ -126,6 +128,29 @@ class CustomPipeline(BasePipeline): logger.info(f"Input text length: {len(text)} chars") logger.info(f"Custom parameter: {custom_param_example}") + # === Handle TTS parameter compatibility === + # Support both old API (voice_id) and new API (tts_inference_mode + tts_voice) + final_voice_id = None + final_tts_workflow = tts_workflow + + if tts_inference_mode: + # New API from web UI + if tts_inference_mode == "local": + # Local Edge TTS mode - use tts_voice + final_voice_id = tts_voice or "zh-CN-YunjianNeural" + final_tts_workflow = None # Don't use workflow in local mode + logger.debug(f"TTS Mode: local (voice={final_voice_id})") + elif tts_inference_mode == "comfyui": + # ComfyUI workflow mode + final_voice_id = None # Don't use voice_id in ComfyUI mode + # tts_workflow already set from parameter + logger.debug(f"TTS Mode: comfyui (workflow={final_tts_workflow})") + else: + # Old API (backward compatibility) + final_voice_id = voice_id or tts_voice or "zh-CN-YunjianNeural" + # tts_workflow already set from parameter + logger.debug(f"TTS Mode: legacy (voice_id={final_voice_id}, workflow={final_tts_workflow})") + # ========== Step 0: Setup ========== self._report_progress(progress_callback, "initializing", 0.05) @@ -240,8 +265,9 @@ class CustomPipeline(BasePipeline): min_image_prompt_words=30, max_image_prompt_words=60, video_fps=video_fps, - voice_id=voice_id, - tts_workflow=tts_workflow, + tts_inference_mode=tts_inference_mode or "local", # TTS inference mode (CRITICAL FIX) + voice_id=final_voice_id, # Use processed voice_id + tts_workflow=final_tts_workflow, # Use processed workflow tts_speed=tts_speed, ref_audio=ref_audio, image_width=image_width, diff --git a/pixelle_video/pipelines/standard.py b/pixelle_video/pipelines/standard.py index 972ed7f..aafcb4f 100644 --- a/pixelle_video/pipelines/standard.py +++ b/pixelle_video/pipelines/standard.py @@ -251,6 +251,7 @@ class StandardPipeline(BasePipeline): min_image_prompt_words=min_image_prompt_words, max_image_prompt_words=max_image_prompt_words, video_fps=video_fps, + tts_inference_mode=tts_inference_mode or "local", # TTS inference mode (CRITICAL FIX) voice_id=final_voice_id, # Use processed voice_id tts_workflow=final_tts_workflow, # Use processed workflow tts_speed=tts_speed, @@ -288,54 +289,77 @@ class StandardPipeline(BasePipeline): logger.info(f"✅ Split script into {len(narrations)} segments (by lines)") logger.info(f" Note: n_scenes={n_scenes} is ignored in fixed mode") - # ========== Step 2: Generate image prompts ========== - self._report_progress(progress_callback, "generating_image_prompts", 0.15) + # ========== Step 2: Check template type and conditionally generate image prompts ========== + # Detect template type to determine if media generation is needed + from pathlib import Path + from pixelle_video.utils.template_util import get_template_type - # Override prompt_prefix if provided - original_prefix = None - if prompt_prefix is not None: - image_config = self.core.config.get("comfyui", {}).get("image", {}) - original_prefix = image_config.get("prompt_prefix") - image_config["prompt_prefix"] = prompt_prefix - logger.info(f"Using custom prompt_prefix: '{prompt_prefix}'") + template_name = Path(config.frame_template).name + template_type = get_template_type(template_name) + template_requires_media = (template_type in ["image", "video"]) - try: - # Create progress callback wrapper for image prompt generation - def image_prompt_progress(completed: int, total: int, message: str): - batch_progress = completed / total if total > 0 else 0 - overall_progress = 0.15 + (batch_progress * 0.15) - self._report_progress( - progress_callback, - "generating_image_prompts", - overall_progress, - extra_info=message + if template_type == "image": + logger.info(f"📸 Template requires image generation") + elif template_type == "video": + logger.info(f"🎬 Template requires video generation") + else: # static + logger.info(f"⚡ Static template - skipping media generation pipeline") + logger.info(f" 💡 Benefits: Faster generation + Lower cost + No ComfyUI dependency") + + # Only generate image prompts if template requires media + if template_requires_media: + self._report_progress(progress_callback, "generating_image_prompts", 0.15) + + # Override prompt_prefix if provided + original_prefix = None + if prompt_prefix is not None: + image_config = self.core.config.get("comfyui", {}).get("image", {}) + original_prefix = image_config.get("prompt_prefix") + image_config["prompt_prefix"] = prompt_prefix + logger.info(f"Using custom prompt_prefix: '{prompt_prefix}'") + + try: + # Create progress callback wrapper for image prompt generation + def image_prompt_progress(completed: int, total: int, message: str): + batch_progress = completed / total if total > 0 else 0 + overall_progress = 0.15 + (batch_progress * 0.15) + self._report_progress( + progress_callback, + "generating_image_prompts", + overall_progress, + extra_info=message + ) + + # Generate base image prompts + base_image_prompts = await generate_image_prompts( + self.llm, + narrations=narrations, + min_words=min_image_prompt_words, + max_words=max_image_prompt_words, + progress_callback=image_prompt_progress ) + + # Apply prompt prefix + from pixelle_video.utils.prompt_helper import build_image_prompt + image_config = self.core.config.get("comfyui", {}).get("image", {}) + prompt_prefix_to_use = prompt_prefix if prompt_prefix is not None else image_config.get("prompt_prefix", "") + + image_prompts = [] + for base_prompt in base_image_prompts: + final_prompt = build_image_prompt(base_prompt, prompt_prefix_to_use) + image_prompts.append(final_prompt) + + finally: + # Restore original prompt_prefix + if original_prefix is not None: + image_config["prompt_prefix"] = original_prefix - # Generate base image prompts - base_image_prompts = await generate_image_prompts( - self.llm, - narrations=narrations, - min_words=min_image_prompt_words, - max_words=max_image_prompt_words, - progress_callback=image_prompt_progress - ) - - # Apply prompt prefix - from pixelle_video.utils.prompt_helper import build_image_prompt - image_config = self.core.config.get("comfyui", {}).get("image", {}) - prompt_prefix_to_use = prompt_prefix if prompt_prefix is not None else image_config.get("prompt_prefix", "") - - image_prompts = [] - for base_prompt in base_image_prompts: - final_prompt = build_image_prompt(base_prompt, prompt_prefix_to_use) - image_prompts.append(final_prompt) - - finally: - # Restore original prompt_prefix - if original_prefix is not None: - image_config["prompt_prefix"] = original_prefix - - logger.info(f"✅ Generated {len(image_prompts)} image prompts") + logger.info(f"✅ Generated {len(image_prompts)} image prompts") + else: + # Static template - skip image prompt generation entirely + image_prompts = [None] * len(narrations) + logger.info(f"⚡ Skipped image prompt generation (static template)") + logger.info(f" 💡 Savings: {len(narrations)} LLM calls + {len(narrations)} media generations") # ========== Step 3: Create frames ========== for i, (narration, image_prompt) in enumerate(zip(narrations, image_prompts)):