tts支持本地合成

2025-11-06 21:06:14 +08:00
parent 56b6b74af7
commit 393cdb8f0a
9 changed files with 531 additions and 112 deletions
--- a/pixelle_video/config/schema.py
+++ b/pixelle_video/config/schema.py
@@ -14,9 +14,28 @@ class LLMConfig(BaseModel):
    model: str = Field(default="", description="LLM Model Name")
 class TTSLocalConfig(BaseModel):
    """Local TTS configuration (Edge TTS)"""
    # Edge TTS voice identifier used when no voice is chosen explicitly.
    voice: str = Field(
        default="zh-CN-YunjianNeural",
        description="Edge TTS voice ID",
    )
    # Speed multiplier; the ge/le bounds restrict it to the range 0.5-2.0.
    speed: float = Field(
        default=1.2,
        ge=0.5,
        le=2.0,
        description="Speech speed multiplier (0.5-2.0)",
    )
 class TTSComfyUIConfig(BaseModel):
    """ComfyUI TTS configuration"""
    # Workflow used for ComfyUI synthesis when none is passed; None means unset.
    default_workflow: Optional[str] = Field(
        default=None,
        description="Default TTS workflow (optional)",
    )
 class TTSSubConfig(BaseModel):
    """TTS-specific configuration (under comfyui.tts)"""
-    default_workflow: Optional[str] = Field(default=None, description="Default TTS workflow (optional)")
+    inference_mode: str = Field(default="local", description="TTS inference mode: 'local' or 'comfyui'")
    local: TTSLocalConfig = Field(default_factory=TTSLocalConfig, description="Local TTS (Edge TTS) configuration")
    comfyui: TTSComfyUIConfig = Field(default_factory=TTSComfyUIConfig, description="ComfyUI TTS configuration")
    # Backward compatibility: keep default_workflow at top level
    @property
    def default_workflow(self) -> Optional[str]:
        """Get default workflow (for backward compatibility)"""
        return self.comfyui.default_workflow
 class ImageSubConfig(BaseModel):
--- a/pixelle_video/models/storyboard.py
+++ b/pixelle_video/models/storyboard.py
@@ -24,10 +24,11 @@ class StoryboardConfig:
    video_fps: int = 30                        # Frame rate
    # Audio parameters
-    voice_id: str = "[Chinese] zh-CN Yunjian"     # Default voice
+    tts_inference_mode: str = "local"          # TTS inference mode: "local" or "comfyui"
-    tts_workflow: Optional[str] = None         # TTS workflow filename (None = use default)
+    voice_id: Optional[str] = None             # Voice ID (for local: Edge TTS voice ID; for comfyui: workflow-specific)
-    tts_speed: float = 1.2                     # TTS speed multiplier (1.0 = normal, >1.0 = faster)
+    tts_workflow: Optional[str] = None         # TTS workflow filename (for ComfyUI mode, None = use default)
-    ref_audio: Optional[str] = None            # Reference audio for voice cloning (only some workflows support this)
+    tts_speed: Optional[float] = None          # TTS speed multiplier (0.5-2.0, 1.0 = normal)
    ref_audio: Optional[str] = None            # Reference audio for voice cloning (ComfyUI mode only)
    # Image parameters
    image_width: int = 1024
--- a/pixelle_video/pipelines/standard.py
+++ b/pixelle_video/pipelines/standard.py
@@ -62,10 +62,17 @@ class StandardPipeline(BasePipeline):
        # === Basic Config ===
        n_scenes: int = 5,  # Only used in generate mode; ignored in fixed mode
-        voice_id: str = "[Chinese] zh-CN Yunjian",
+        
-        tts_workflow: Optional[str] = None,
+        # === TTS Parameters ===
-        tts_speed: float = 1.2,
+        tts_inference_mode: Optional[str] = None,  # "local" or "comfyui"
-        ref_audio: Optional[str] = None,  # Reference audio for voice cloning
+        tts_voice: Optional[str] = None,  # For local mode: Edge TTS voice ID
        tts_speed: Optional[float] = None,  # Speed multiplier (0.5-2.0)
        tts_workflow: Optional[str] = None,  # For ComfyUI mode: workflow path
        ref_audio: Optional[str] = None,  # For ComfyUI mode: reference audio
        # Deprecated (kept for backward compatibility)
        voice_id: Optional[str] = None,
        output_path: Optional[str] = None,
        # === LLM Parameters ===
@@ -191,6 +198,29 @@ class StandardPipeline(BasePipeline):
            output_path = get_task_final_video_path(task_id)
            logger.info(f"   Will copy final video to: {user_specified_output}")
        # Determine TTS inference mode and parameters
        # Priority: explicit params > backward compatibility > config defaults
        if tts_inference_mode is None:
            # Check if user provided ComfyUI-specific params
            if tts_workflow is not None or ref_audio is not None:
                tts_inference_mode = "comfyui"
            # Check if user provided old voice_id param (backward compatibility)
            elif voice_id is not None:
                tts_inference_mode = "comfyui"
                if tts_voice is None:
                    tts_voice = voice_id
            else:
                # Use config default
                tts_config = self.core.config.get("comfyui", {}).get("tts", {})
                tts_inference_mode = tts_config.get("inference_mode", "local")
        # Set voice_id based on mode for StoryboardConfig
        final_voice_id = None
        if tts_inference_mode == "local":
            final_voice_id = tts_voice or voice_id
        else:  # comfyui
            final_voice_id = voice_id  # For ComfyUI, might be None
        # Create storyboard config
        config = StoryboardConfig(
            task_id=task_id,
@@ -200,7 +230,8 @@ class StandardPipeline(BasePipeline):
            min_image_prompt_words=min_image_prompt_words,
            max_image_prompt_words=max_image_prompt_words,
            video_fps=video_fps,
-            voice_id=voice_id,
+            tts_inference_mode=tts_inference_mode,
            voice_id=final_voice_id,
            tts_workflow=tts_workflow,
            tts_speed=tts_speed,
            ref_audio=ref_audio,
--- a/pixelle_video/services/frame_processor.py
+++ b/pixelle_video/services/frame_processor.py
@@ -124,18 +124,29 @@ class FrameProcessor:
        from pixelle_video.utils.os_util import get_task_frame_path
        output_path = get_task_frame_path(config.task_id, frame.index, "audio")
-        # Call TTS with specific output path and workflow
+        # Build TTS params based on inference mode
        tts_params = {
            "text": frame.narration,
-            "workflow": config.tts_workflow,
+            "inference_mode": config.tts_inference_mode,
            "voice": config.voice_id,
            "speed": config.tts_speed,
            "output_path": output_path,
        }
-        # Add ref_audio if provided
+        if config.tts_inference_mode == "local":
-        if config.ref_audio:
+            # Local mode: pass voice and speed
-            tts_params["ref_audio"] = config.ref_audio
+            if config.voice_id:
                tts_params["voice"] = config.voice_id
            if config.tts_speed is not None:
                tts_params["speed"] = config.tts_speed
        else:  # comfyui
            # ComfyUI mode: pass workflow, voice, speed, and ref_audio
            if config.tts_workflow:
                tts_params["workflow"] = config.tts_workflow
            if config.voice_id:
                tts_params["voice"] = config.voice_id
            if config.tts_speed is not None:
                tts_params["speed"] = config.tts_speed
            if config.ref_audio:
                tts_params["ref_audio"] = config.ref_audio
        audio_path = await self.core.tts(**tts_params)
--- a/pixelle_video/services/tts_service.py
+++ b/pixelle_video/services/tts_service.py
@@ -1,13 +1,18 @@
 """
-TTS (Text-to-Speech) Service - ComfyUI Workflow-based implementation
+TTS (Text-to-Speech) Service - Supports both local and ComfyUI inference
 """
 import os
 import uuid
 from pathlib import Path
 from typing import Optional
 from comfykit import ComfyKit
 from loguru import logger
 from pixelle_video.services.comfy_base_service import ComfyBaseService
 from pixelle_video.utils.tts_util import edge_tts
 from pixelle_video.tts_voices import speed_to_rate
 class TTSService(ComfyBaseService):
@@ -52,22 +57,25 @@ class TTSService(ComfyBaseService):
        comfyui_url: Optional[str] = None,
        runninghub_api_key: Optional[str] = None,
        # TTS parameters
-        voice: str = "[Chinese] zh-CN Yunjian",
+        voice: Optional[str] = None,
-        speed: float = 1.2,
+        speed: Optional[float] = None,
        # Inference mode override
        inference_mode: Optional[str] = None,
        # Output path
        output_path: Optional[str] = None,
        **params
    ) -> str:
        """
-        Generate speech using ComfyUI workflow
+        Generate speech using local Edge TTS or ComfyUI workflow
        Args:
            text: Text to convert to speech
-            workflow: Workflow filename (default: from config)
+            workflow: Workflow filename (for ComfyUI mode, default: from config)
            comfyui_url: ComfyUI URL (optional, overrides config)
            runninghub_api_key: RunningHub API key (optional, overrides config)
-            voice: Voice ID (workflow-specific)
+            voice: Voice ID (for local mode: Edge TTS voice ID; for ComfyUI: workflow-specific)
            speed: Speech speed multiplier (1.0 = normal, >1.0 = faster, <1.0 = slower)
            inference_mode: Override inference mode ("local" or "comfyui", default: from config)
            output_path: Custom output path (auto-generated if None)
            **params: Additional workflow parameters
@@ -75,49 +83,103 @@ class TTSService(ComfyBaseService):
            Generated audio file path
        Examples:
-            # Simplest: use default workflow
+            # Local inference (Edge TTS)
            audio_path = await pixelle_video.tts(text="Hello, world!")
            # Use specific workflow
            audio_path = await pixelle_video.tts(
-                text="你好，世界！",
+                text="Hello, world!",
-                workflow="tts_edge.json"
+                inference_mode="local",
-            )
+                voice="zh-CN-YunjianNeural",
            # With voice and speed
            audio_path = await pixelle_video.tts(
                text="Hello",
                workflow="tts_edge.json",
                voice="[Chinese] zh-CN Xiaoxiao",
                speed=1.2
            )
-            # With absolute path
+            # ComfyUI inference
            audio_path = await pixelle_video.tts(
-                text="Hello",
+                text="你好，世界！",
-                workflow="/path/to/custom_tts.json"
+                inference_mode="comfyui",
-            )
+                workflow="runninghub/tts_edge.json"
            # With custom ComfyUI server
            audio_path = await pixelle_video.tts(
                text="Hello",
                comfyui_url="http://192.168.1.100:8188"
            )
        """
-        # 1. Resolve workflow (returns structured info)
+        # Determine inference mode (param > config)
-        workflow_info = self._resolve_workflow(workflow=workflow)
+        mode = inference_mode or self.config.get("inference_mode", "local")
-        # 2. Execute ComfyUI workflow
+        # Route to appropriate implementation
-        return await self._call_comfyui_workflow(
+        if mode == "local":
-            workflow_info=workflow_info,
+            return await self._call_local_tts(
-            text=text,
+                text=text,
-            comfyui_url=comfyui_url,
+                voice=voice,
-            runninghub_api_key=runninghub_api_key,
+                speed=speed,
-            voice=voice,
+                output_path=output_path
-            speed=speed,
+            )
-            output_path=output_path,
+        else:  # comfyui
-            **params
+            # 1. Resolve workflow (returns structured info)
-        )
+            workflow_info = self._resolve_workflow(workflow=workflow)
            # 2. Execute ComfyUI workflow
            return await self._call_comfyui_workflow(
                workflow_info=workflow_info,
                text=text,
                comfyui_url=comfyui_url,
                runninghub_api_key=runninghub_api_key,
                voice=voice,
                speed=speed,
                output_path=output_path,
                **params
            )
    async def _call_local_tts(
        self,
        text: str,
        voice: Optional[str] = None,
        speed: Optional[float] = None,
        output_path: Optional[str] = None,
    ) -> str:
        """
        Generate speech using local Edge TTS

        Args:
            text: Text to convert to speech
            voice: Edge TTS voice ID (falls back to the "local" config section,
                then to "zh-CN-YunjianNeural")
            speed: Speech speed multiplier (falls back to the "local" config
                section, then to 1.2)
            output_path: Custom output path (auto-generated under "output/"
                when empty or None)

        Returns:
            Generated audio file path

        Raises:
            Exception: Any error raised by the Edge TTS call is logged and
                re-raised unchanged.
        """
        # Explicit arguments win over the "local" config section, which in turn
        # wins over the hard-coded defaults.
        local_config = self.config.get("local", {})
        final_voice = voice or local_config.get("voice", "zh-CN-YunjianNeural")
        # `is not None` (not truthiness) so an explicit speed is never discarded.
        final_speed = speed if speed is not None else local_config.get("speed", 1.2)

        # Edge TTS takes a signed percentage string rather than a multiplier.
        rate = speed_to_rate(final_speed)
        logger.info(f"🎙️  Using local Edge TTS: voice={final_voice}, speed={final_speed}x (rate={rate})")

        if not output_path:
            # No path supplied: use a unique file name under "output/".
            output_path = f"output/{uuid.uuid4().hex}.mp3"

        # Make sure the target directory exists for generated and
        # caller-supplied paths alike.
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

        try:
            # The helper's return value is not needed here; presumably it
            # writes the audio to output_path (verify against tts_util).
            await edge_tts(
                text=text,
                voice=final_voice,
                rate=rate,
                output_path=output_path
            )
        except Exception as e:
            logger.error(f"Local TTS generation error: {e}")
            raise
        else:
            logger.info(f"✅ Generated audio (local Edge TTS): {output_path}")
            return output_path
    async def _call_comfyui_workflow(
        self,
--- a/pixelle_video/tts_voices.py
+++ b/pixelle_video/tts_voices.py
@@ -0,0 +1,147 @@
 """
 TTS Voice Configuration
 Defines available voices for local Edge TTS inference.
 """
 from typing import List, Dict, Any
 # Edge TTS voice presets for local inference.
 # Each entry carries the voice ID, its i18n label key, locale and gender.
 # The label key is always "tts.voice." followed by the voice ID with every
 # "-" replaced by "_", so it is derived instead of being spelled out per row.
 EDGE_TTS_VOICES: List[Dict[str, Any]] = [
    {
        "id": voice_id,
        "label_key": "tts.voice." + voice_id.replace("-", "_"),
        "locale": locale,
        "gender": gender,
    }
    for voice_id, locale, gender in (
        # Chinese voices
        ("zh-CN-XiaoxiaoNeural", "zh-CN", "female"),
        ("zh-CN-XiaoyiNeural", "zh-CN", "female"),
        ("zh-CN-YunjianNeural", "zh-CN", "male"),
        ("zh-CN-YunxiNeural", "zh-CN", "male"),
        ("zh-CN-YunyangNeural", "zh-CN", "male"),
        ("zh-CN-YunyeNeural", "zh-CN", "male"),
        ("zh-CN-YunfengNeural", "zh-CN", "male"),
        ("zh-CN-liaoning-XiaobeiNeural", "zh-CN", "female"),
        # English voices
        ("en-US-AriaNeural", "en-US", "female"),
        ("en-US-JennyNeural", "en-US", "female"),
        ("en-US-GuyNeural", "en-US", "male"),
        ("en-US-DavisNeural", "en-US", "male"),
        ("en-GB-SoniaNeural", "en-GB", "female"),
        ("en-GB-RyanNeural", "en-GB", "male"),
    )
 ]
 def get_voice_display_name(voice_id: str, tr_func=None, locale: str = "zh_CN") -> str:
    """
    Resolve the label shown to the user for an Edge TTS voice.

    Args:
        voice_id: Voice ID (e.g., "zh-CN-YunjianNeural")
        tr_func: Optional translation callable; it receives the voice's
            label key and its result is returned as-is
        locale: Current locale (default: "zh_CN")

    Returns:
        The translated label when the voice is a known preset, the locale is
        "zh_CN" and a translation function was supplied; otherwise the voice
        ID itself.
    """
    for preset in EDGE_TTS_VOICES:
        if preset["id"] != voice_id:
            continue
        # Only the first preset with a matching ID is considered.
        if locale == "zh_CN" and tr_func:
            return tr_func(preset["label_key"])
        return voice_id
    # Unknown voice: nothing to translate, show the raw ID.
    return voice_id
 def speed_to_rate(speed: float) -> str:
    """
    Convert speed multiplier to Edge TTS rate parameter

    Args:
        speed: Speed multiplier (1.0 = normal, 1.2 = 120%)

    Returns:
        Signed percentage string (e.g., "+20%", "-10%"); zero is "+0%"

    Examples:
        1.0 → "+0%"
        1.2 → "+20%"
        0.8 → "-20%"
    """
    # round(), not int(): in binary floating point (1.2 - 1.0) * 100 evaluates
    # to 19.999999999999996, so truncation would produce "+19%" instead of "+20%".
    percentage = round((speed - 1.0) * 100)
    # The "+" format flag always emits a sign, including for zero.
    return f"{percentage:+d}%"
--- a/web/app.py
+++ b/web/app.py
@@ -449,58 +449,146 @@ def main():
                st.markdown(f"**{tr('help.how')}**")
                st.markdown(tr("tts.how"))
-            # Get available TTS workflows
+            # Get TTS config
            tts_workflows = pixelle_video.tts.list_workflows()
            # Build options for selectbox
            tts_workflow_options = [wf["display_name"] for wf in tts_workflows]
            tts_workflow_keys = [wf["key"] for wf in tts_workflows]
            # Default to saved workflow if exists
            default_tts_index = 0
            comfyui_config = config_manager.get_comfyui_config()
-            saved_tts_workflow = comfyui_config["tts"]["default_workflow"]
+            tts_config = comfyui_config["tts"]
            if saved_tts_workflow and saved_tts_workflow in tts_workflow_keys:
                default_tts_index = tts_workflow_keys.index(saved_tts_workflow)
-            tts_workflow_display = st.selectbox(
+            # Inference mode selection
-                "TTS Workflow",
+            tts_mode = st.radio(
-                tts_workflow_options if tts_workflow_options else ["No TTS workflows found"],
+                tr("tts.inference_mode"),
-                index=default_tts_index,
+                ["local", "comfyui"],
-                label_visibility="collapsed",
+                horizontal=True,
-                key="tts_workflow_select"
+                format_func=lambda x: tr(f"tts.mode.{x}"),
                index=0 if tts_config.get("inference_mode", "local") == "local" else 1,
                key="tts_inference_mode"
            )
-            # Get the actual workflow key
+            # Show hint based on mode
-            if tts_workflow_options:
+            if tts_mode == "local":
-                tts_selected_index = tts_workflow_options.index(tts_workflow_display)
+                st.caption(tr("tts.mode.local_hint"))
                tts_workflow_key = tts_workflow_keys[tts_selected_index]
            else:
-                tts_workflow_key = "selfhost/tts_edge.json"  # fallback
+                st.caption(tr("tts.mode.comfyui_hint"))
-            # Reference audio upload (optional, for voice cloning)
+            # ================================================================
-            ref_audio_file = st.file_uploader(
+            # Local Mode UI
-                tr("tts.ref_audio"),
+            # ================================================================
-                type=["mp3", "wav", "flac", "m4a", "aac", "ogg"],
+            if tts_mode == "local":
-                help=tr("tts.ref_audio_help"),
+                # Import voice configuration
-                key="ref_audio_upload"
+                from pixelle_video.tts_voices import EDGE_TTS_VOICES, get_voice_display_name
            )
            # Save uploaded ref_audio to temp file if provided
            ref_audio_path = None
            if ref_audio_file is not None:
                # Audio preview player (directly play uploaded file)
                st.audio(ref_audio_file)
-                # Save to temp directory
+                # Get saved voice from config
-                import tempfile
+                local_config = tts_config.get("local", {})
-                temp_dir = Path("temp")
+                saved_voice = local_config.get("voice", "zh-CN-YunjianNeural")
-                temp_dir.mkdir(exist_ok=True)
+                saved_speed = local_config.get("speed", 1.2)
-                ref_audio_path = temp_dir / f"ref_audio_{ref_audio_file.name}"
+                
-                with open(ref_audio_path, "wb") as f:
+                # Build voice options with i18n
-                    f.write(ref_audio_file.getbuffer())
+                voice_options = []
                voice_ids = []
                default_voice_index = 0
                for idx, voice_config in enumerate(EDGE_TTS_VOICES):
                    voice_id = voice_config["id"]
                    display_name = get_voice_display_name(voice_id, tr, get_language())
                    voice_options.append(display_name)
                    voice_ids.append(voice_id)
                    # Set default index if matches saved voice
                    if voice_id == saved_voice:
                        default_voice_index = idx
                # Two-column layout: Voice | Speed
                voice_col, speed_col = st.columns([1, 1])
                with voice_col:
                    # Voice selector
                    selected_voice_display = st.selectbox(
                        tr("tts.voice_selector"),
                        voice_options,
                        index=default_voice_index,
                        key="tts_local_voice"
                    )
                    # Get actual voice ID
                    selected_voice_index = voice_options.index(selected_voice_display)
                    selected_voice = voice_ids[selected_voice_index]
                with speed_col:
                    # Speed slider
                    tts_speed = st.slider(
                        tr("tts.speed"),
                        min_value=0.5,
                        max_value=2.0,
                        value=saved_speed,
                        step=0.1,
                        format="%.1fx",
                        key="tts_local_speed"
                    )
                    st.caption(tr("tts.speed_label", speed=f"{tts_speed:.1f}"))
                # Variables for video generation
                tts_workflow_key = None
                ref_audio_path = None
-            # TTS preview expander (simplified, uses default voice and speed)
+            # ================================================================
            # ComfyUI Mode UI
            # ================================================================
            else:  # comfyui mode
                # Get available TTS workflows
                tts_workflows = pixelle_video.tts.list_workflows()
                # Build options for selectbox
                tts_workflow_options = [wf["display_name"] for wf in tts_workflows]
                tts_workflow_keys = [wf["key"] for wf in tts_workflows]
                # Default to saved workflow if exists
                default_tts_index = 0
                saved_tts_workflow = tts_config.get("comfyui", {}).get("default_workflow")
                if saved_tts_workflow and saved_tts_workflow in tts_workflow_keys:
                    default_tts_index = tts_workflow_keys.index(saved_tts_workflow)
                tts_workflow_display = st.selectbox(
                    "TTS Workflow",
                    tts_workflow_options if tts_workflow_options else ["No TTS workflows found"],
                    index=default_tts_index,
                    label_visibility="collapsed",
                    key="tts_workflow_select"
                )
                # Get the actual workflow key
                if tts_workflow_options:
                    tts_selected_index = tts_workflow_options.index(tts_workflow_display)
                    tts_workflow_key = tts_workflow_keys[tts_selected_index]
                else:
                    tts_workflow_key = "selfhost/tts_edge.json"  # fallback
                # Reference audio upload (optional, for voice cloning)
                ref_audio_file = st.file_uploader(
                    tr("tts.ref_audio"),
                    type=["mp3", "wav", "flac", "m4a", "aac", "ogg"],
                    help=tr("tts.ref_audio_help"),
                    key="ref_audio_upload"
                )
                # Save uploaded ref_audio to temp file if provided
                ref_audio_path = None
                if ref_audio_file is not None:
                    # Audio preview player (directly play uploaded file)
                    st.audio(ref_audio_file)
                    # Save to temp directory
                    temp_dir = Path("temp")
                    temp_dir.mkdir(exist_ok=True)
                    ref_audio_path = temp_dir / f"ref_audio_{ref_audio_file.name}"
                    with open(ref_audio_path, "wb") as f:
                        f.write(ref_audio_file.getbuffer())
                # Variables for video generation
                selected_voice = None
                tts_speed = None
            # ================================================================
            # TTS Preview (works for both modes)
            # ================================================================
            with st.expander(tr("tts.preview_title"), expanded=False):
                # Preview text input
                preview_text = st.text_input(
@@ -514,14 +602,19 @@ def main():
                if st.button(tr("tts.preview_button"), key="preview_tts", use_container_width=True):
                    with st.spinner(tr("tts.previewing")):
                        try:
-                            # Generate preview audio using selected workflow (use default voice and speed)
+                            # Build TTS params based on mode
                            # Pass ref_audio if uploaded
                            tts_params = {
                                "text": preview_text,
-                                "workflow": tts_workflow_key
+                                "inference_mode": tts_mode
                            }
-                            if ref_audio_path:
+                            
-                                tts_params["ref_audio"] = str(ref_audio_path)
+                            if tts_mode == "local":
                                tts_params["voice"] = selected_voice
                                tts_params["speed"] = tts_speed
                            else:  # comfyui
                                tts_params["workflow"] = tts_workflow_key
                                if ref_audio_path:
                                    tts_params["ref_audio"] = str(ref_audio_path)
                            audio_path = run_async(pixelle_video.tts(**tts_params))
@@ -979,7 +1072,6 @@ def main():
                        "mode": mode,
                        "title": title if title else None,
                        "n_scenes": n_scenes,
                        "tts_workflow": tts_workflow_key,
                        "image_workflow": workflow_key,
                        "image_width": int(image_width),
                        "image_height": int(image_height),
@@ -989,14 +1081,20 @@ def main():
                        "progress_callback": update_progress,
                    }
                    # Add TTS parameters based on mode
                    video_params["tts_inference_mode"] = tts_mode
                    if tts_mode == "local":
                        video_params["tts_voice"] = selected_voice
                        video_params["tts_speed"] = tts_speed
                    else:  # comfyui
                        video_params["tts_workflow"] = tts_workflow_key
                        if ref_audio_path:
                            video_params["ref_audio"] = str(ref_audio_path)
                    # Add custom template parameters if any
                    if custom_values_for_video:
                        video_params["template_params"] = custom_values_for_video
                    # Add ref_audio if uploaded
                    if ref_audio_path:
                        video_params["ref_audio"] = str(ref_audio_path)
                    result = run_async(pixelle_video.generate_video(**video_params))
                    progress_bar.progress(100)
--- a/web/i18n/locales/en_US.json
+++ b/web/i18n/locales/en_US.json
@@ -179,6 +179,31 @@
    "settings.comfyui.runninghub_api_key": "RunningHub API Key",
    "settings.comfyui.runninghub_api_key_help": "Visit https://runninghub.ai to register and get API Key",
    "tts.inference_mode": "Synthesis Mode",
    "tts.mode.local": "Local Synthesis",
    "tts.mode.comfyui": "ComfyUI Synthesis",
    "tts.mode.local_hint": "💡 Using Edge TTS, no configuration required, ready to use",
    "tts.mode.comfyui_hint": "⚙️ Using ComfyUI workflows, flexible and powerful",
    "tts.voice_selector": "Voice Selection",
    "tts.speed": "Speed",
    "tts.speed_label": "{speed}x",
    "tts.voice.zh_CN_XiaoxiaoNeural": "zh-CN-XiaoxiaoNeural",
    "tts.voice.zh_CN_XiaoyiNeural": "zh-CN-XiaoyiNeural",
    "tts.voice.zh_CN_YunjianNeural": "zh-CN-YunjianNeural",
    "tts.voice.zh_CN_YunxiNeural": "zh-CN-YunxiNeural",
    "tts.voice.zh_CN_YunyangNeural": "zh-CN-YunyangNeural",
    "tts.voice.zh_CN_YunyeNeural": "zh-CN-YunyeNeural",
    "tts.voice.zh_CN_YunfengNeural": "zh-CN-YunfengNeural",
    "tts.voice.zh_CN_liaoning_XiaobeiNeural": "zh-CN-liaoning-XiaobeiNeural",
    "tts.voice.en_US_AriaNeural": "en-US-AriaNeural",
    "tts.voice.en_US_JennyNeural": "en-US-JennyNeural",
    "tts.voice.en_US_GuyNeural": "en-US-GuyNeural",
    "tts.voice.en_US_DavisNeural": "en-US-DavisNeural",
    "tts.voice.en_GB_SoniaNeural": "en-GB-SoniaNeural",
    "tts.voice.en_GB_RyanNeural": "en-GB-RyanNeural",
    "tts.selector": "Workflow Selection",
    "tts.what": "Converts narration text to natural human-like speech (some workflows support reference audio for voice cloning)",
    "tts.how": "Place tts_xxx.json workflow files in workflows/selfhost/ (local ComfyUI) or workflows/runninghub/ (cloud) folder",
--- a/web/i18n/locales/zh_CN.json
+++ b/web/i18n/locales/zh_CN.json
@@ -179,6 +179,31 @@
    "settings.comfyui.runninghub_api_key": "RunningHub API 密钥",
    "settings.comfyui.runninghub_api_key_help": "访问 https://runninghub.ai 注册并获取 API Key",
    "tts.inference_mode": "合成方式",
    "tts.mode.local": "本地合成",
    "tts.mode.comfyui": "ComfyUI 合成",
    "tts.mode.local_hint": "💡 使用 Edge TTS，无需配置，开箱即用（请确保网络环境可用）",
    "tts.mode.comfyui_hint": "⚙️ 使用 ComfyUI 工作流，灵活强大",
    "tts.voice_selector": "音色选择",
    "tts.speed": "语速",
    "tts.speed_label": "{speed}x",
    "tts.voice.zh_CN_XiaoxiaoNeural": "女声-温柔（晓晓）",
    "tts.voice.zh_CN_XiaoyiNeural": "女声-甜美（晓伊）",
    "tts.voice.zh_CN_YunjianNeural": "男声-专业（云健）",
    "tts.voice.zh_CN_YunxiNeural": "男声-磁性（云希）",
    "tts.voice.zh_CN_YunyangNeural": "男声-新闻（云扬）",
    "tts.voice.zh_CN_YunyeNeural": "男声-自然（云野）",
    "tts.voice.zh_CN_YunfengNeural": "男声-沉稳（云锋）",
    "tts.voice.zh_CN_liaoning_XiaobeiNeural": "女声-东北（小北）",
    "tts.voice.en_US_AriaNeural": "女声-自然（Aria）",
    "tts.voice.en_US_JennyNeural": "女声-温暖（Jenny）",
    "tts.voice.en_US_GuyNeural": "男声-标准（Guy）",
    "tts.voice.en_US_DavisNeural": "男声-友好（Davis）",
    "tts.voice.en_GB_SoniaNeural": "女声-英式（Sonia）",
    "tts.voice.en_GB_RyanNeural": "男声-英式（Ryan）",
    "tts.selector": "工作流选择",
    "tts.what": "将旁白文本转换为真人般的自然语音（部分工作流支持参考音频克隆声音）",
    "tts.how": "将 tts_xxx.json 工作流文件放入 workflows/selfhost/（本地 ComfyUI）或 workflows/runninghub/（云端）文件夹",