From 393cdb8f0ae290995cd5c6b23ee8a72253182e97 Mon Sep 17 00:00:00 2001 From: puke <1129090915@qq.com> Date: Thu, 6 Nov 2025 21:06:14 +0800 Subject: [PATCH] =?UTF-8?q?tts=E6=94=AF=E6=8C=81=E6=9C=AC=E5=9C=B0?= =?UTF-8?q?=E5=90=88=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pixelle_video/config/schema.py | 21 ++- pixelle_video/models/storyboard.py | 9 +- pixelle_video/pipelines/standard.py | 41 ++++- pixelle_video/services/frame_processor.py | 25 ++- pixelle_video/services/tts_service.py | 144 ++++++++++----- pixelle_video/tts_voices.py | 147 +++++++++++++++ web/app.py | 206 ++++++++++++++++------ web/i18n/locales/en_US.json | 25 +++ web/i18n/locales/zh_CN.json | 25 +++ 9 files changed, 531 insertions(+), 112 deletions(-) create mode 100644 pixelle_video/tts_voices.py diff --git a/pixelle_video/config/schema.py b/pixelle_video/config/schema.py index 92bc79b..90423a1 100644 --- a/pixelle_video/config/schema.py +++ b/pixelle_video/config/schema.py @@ -14,9 +14,28 @@ class LLMConfig(BaseModel): model: str = Field(default="", description="LLM Model Name") +class TTSLocalConfig(BaseModel): + """Local TTS configuration (Edge TTS)""" + voice: str = Field(default="zh-CN-YunjianNeural", description="Edge TTS voice ID") + speed: float = Field(default=1.2, ge=0.5, le=2.0, description="Speech speed multiplier (0.5-2.0)") + + +class TTSComfyUIConfig(BaseModel): + """ComfyUI TTS configuration""" + default_workflow: Optional[str] = Field(default=None, description="Default TTS workflow (optional)") + + class TTSSubConfig(BaseModel): """TTS-specific configuration (under comfyui.tts)""" - default_workflow: Optional[str] = Field(default=None, description="Default TTS workflow (optional)") + inference_mode: str = Field(default="local", description="TTS inference mode: 'local' or 'comfyui'") + local: TTSLocalConfig = Field(default_factory=TTSLocalConfig, description="Local TTS (Edge TTS) configuration") + comfyui: TTSComfyUIConfig = Field(default_factory=TTSComfyUIConfig, description="ComfyUI TTS configuration") + + # Backward compatibility: keep default_workflow at top level + @property + def default_workflow(self) -> Optional[str]: + """Get default workflow (for backward compatibility)""" + return self.comfyui.default_workflow class ImageSubConfig(BaseModel): diff --git a/pixelle_video/models/storyboard.py b/pixelle_video/models/storyboard.py index 8f541ad..1fd7f68 100644 --- a/pixelle_video/models/storyboard.py +++ b/pixelle_video/models/storyboard.py @@ -24,10 +24,11 @@ class StoryboardConfig: video_fps: int = 30 # Frame rate # Audio parameters - voice_id: str = "[Chinese] zh-CN Yunjian" # Default voice - tts_workflow: Optional[str] = None # TTS workflow filename (None = use default) - tts_speed: float = 1.2 # TTS speed multiplier (1.0 = normal, >1.0 = faster) - ref_audio: Optional[str] = None # Reference audio for voice cloning (only some workflows support this) + tts_inference_mode: str = "local" # TTS inference mode: "local" or "comfyui" + voice_id: Optional[str] = None # Voice ID (for local: Edge TTS voice ID; for comfyui: workflow-specific) + tts_workflow: Optional[str] = None # TTS workflow filename (for ComfyUI mode, None = use default) + tts_speed: Optional[float] = None # TTS speed multiplier (0.5-2.0, 1.0 = normal) + ref_audio: Optional[str] = None # Reference audio for voice cloning (ComfyUI mode only) # Image parameters image_width: int = 1024 diff --git a/pixelle_video/pipelines/standard.py b/pixelle_video/pipelines/standard.py index ca86944..3539194 100644 --- a/pixelle_video/pipelines/standard.py +++ b/pixelle_video/pipelines/standard.py @@ -62,10 +62,17 @@ class StandardPipeline(BasePipeline): # === Basic Config === n_scenes: int = 5, # Only used in generate mode; ignored in fixed mode - voice_id: str = "[Chinese] zh-CN Yunjian", - tts_workflow: Optional[str] = None, - tts_speed: float = 1.2, - ref_audio: Optional[str] = None, # Reference audio for voice cloning + + # === TTS Parameters === + tts_inference_mode: Optional[str] = None, # "local" or "comfyui" + tts_voice: Optional[str] = None, # For local mode: Edge TTS voice ID + tts_speed: Optional[float] = None, # Speed multiplier (0.5-2.0) + tts_workflow: Optional[str] = None, # For ComfyUI mode: workflow path + ref_audio: Optional[str] = None, # For ComfyUI mode: reference audio + + # Deprecated (kept for backward compatibility) + voice_id: Optional[str] = None, + output_path: Optional[str] = None, # === LLM Parameters === @@ -191,6 +198,29 @@ class StandardPipeline(BasePipeline): output_path = get_task_final_video_path(task_id) logger.info(f" Will copy final video to: {user_specified_output}") + # Determine TTS inference mode and parameters + # Priority: explicit params > backward compatibility > config defaults + if tts_inference_mode is None: + # Check if user provided ComfyUI-specific params + if tts_workflow is not None or ref_audio is not None: + tts_inference_mode = "comfyui" + # Check if user provided old voice_id param (backward compatibility) + elif voice_id is not None: + tts_inference_mode = "comfyui" + if tts_voice is None: + tts_voice = voice_id + else: + # Use config default + tts_config = self.core.config.get("comfyui", {}).get("tts", {}) + tts_inference_mode = tts_config.get("inference_mode", "local") + + # Set voice_id based on mode for StoryboardConfig + final_voice_id = None + if tts_inference_mode == "local": + final_voice_id = tts_voice or voice_id + else: # comfyui + final_voice_id = voice_id # For ComfyUI, might be None + # Create storyboard config config = StoryboardConfig( task_id=task_id, @@ -200,7 +230,8 @@ class StandardPipeline(BasePipeline): min_image_prompt_words=min_image_prompt_words, max_image_prompt_words=max_image_prompt_words, video_fps=video_fps, - voice_id=voice_id, + tts_inference_mode=tts_inference_mode, + voice_id=final_voice_id, tts_workflow=tts_workflow, tts_speed=tts_speed, ref_audio=ref_audio, diff --git a/pixelle_video/services/frame_processor.py b/pixelle_video/services/frame_processor.py index 3472d4b..74d6f13 100644 --- a/pixelle_video/services/frame_processor.py +++ b/pixelle_video/services/frame_processor.py @@ -124,18 +124,29 @@ class FrameProcessor: from pixelle_video.utils.os_util import get_task_frame_path output_path = get_task_frame_path(config.task_id, frame.index, "audio") - # Call TTS with specific output path and workflow + # Build TTS params based on inference mode tts_params = { "text": frame.narration, - "workflow": config.tts_workflow, - "voice": config.voice_id, - "speed": config.tts_speed, + "inference_mode": config.tts_inference_mode, "output_path": output_path, } - # Add ref_audio if provided - if config.ref_audio: - tts_params["ref_audio"] = config.ref_audio + if config.tts_inference_mode == "local": + # Local mode: pass voice and speed + if config.voice_id: + tts_params["voice"] = config.voice_id + if config.tts_speed is not None: + tts_params["speed"] = config.tts_speed + else: # comfyui + # ComfyUI mode: pass workflow, voice, speed, and ref_audio + if config.tts_workflow: + tts_params["workflow"] = config.tts_workflow + if config.voice_id: + tts_params["voice"] = config.voice_id + if config.tts_speed is not None: + tts_params["speed"] = config.tts_speed + if config.ref_audio: + tts_params["ref_audio"] = config.ref_audio audio_path = await self.core.tts(**tts_params) diff --git a/pixelle_video/services/tts_service.py b/pixelle_video/services/tts_service.py index 97da7e6..cef60d3 100644 --- a/pixelle_video/services/tts_service.py +++ b/pixelle_video/services/tts_service.py @@ -1,13 +1,18 @@ """ -TTS (Text-to-Speech) Service - ComfyUI Workflow-based implementation +TTS (Text-to-Speech) Service - Supports both local and ComfyUI inference """ +import os +import uuid +from pathlib import Path from typing import Optional from comfykit import ComfyKit from loguru import logger from pixelle_video.services.comfy_base_service import ComfyBaseService +from pixelle_video.utils.tts_util import edge_tts +from pixelle_video.tts_voices import speed_to_rate class TTSService(ComfyBaseService): @@ -52,22 +57,25 @@ class TTSService(ComfyBaseService): comfyui_url: Optional[str] = None, runninghub_api_key: Optional[str] = None, # TTS parameters - voice: str = "[Chinese] zh-CN Yunjian", - speed: float = 1.2, + voice: Optional[str] = None, + speed: Optional[float] = None, + # Inference mode override + inference_mode: Optional[str] = None, # Output path output_path: Optional[str] = None, **params ) -> str: """ - Generate speech using ComfyUI workflow + Generate speech using local Edge TTS or ComfyUI workflow Args: text: Text to convert to speech - workflow: Workflow filename (default: from config) + workflow: Workflow filename (for ComfyUI mode, default: from config) comfyui_url: ComfyUI URL (optional, overrides config) runninghub_api_key: RunningHub API key (optional, overrides config) - voice: Voice ID (workflow-specific) + voice: Voice ID (for local mode: Edge TTS voice ID; for ComfyUI: workflow-specific) speed: Speech speed multiplier (1.0 = normal, >1.0 = faster, <1.0 = slower) + inference_mode: Override inference mode ("local" or "comfyui", default: from config) output_path: Custom output path (auto-generated if None) **params: Additional workflow parameters @@ -75,49 +83,103 @@ class TTSService(ComfyBaseService): Generated audio file path Examples: - # Simplest: use default workflow - audio_path = await pixelle_video.tts(text="Hello, world!") - - # Use specific workflow + # Local inference (Edge TTS) audio_path = await pixelle_video.tts( - text="你好,世界!", - workflow="tts_edge.json" - ) - - # With voice and speed - audio_path = await pixelle_video.tts( - text="Hello", - workflow="tts_edge.json", - voice="[Chinese] zh-CN Xiaoxiao", + text="Hello, world!", + inference_mode="local", + voice="zh-CN-YunjianNeural", speed=1.2 ) - # With absolute path + # ComfyUI inference audio_path = await pixelle_video.tts( - text="Hello", - workflow="/path/to/custom_tts.json" - ) - - # With custom ComfyUI server - audio_path = await pixelle_video.tts( - text="Hello", - comfyui_url="http://192.168.1.100:8188" + text="你好,世界!", + inference_mode="comfyui", + workflow="runninghub/tts_edge.json" ) """ - # 1. Resolve workflow (returns structured info) - workflow_info = self._resolve_workflow(workflow=workflow) + # Determine inference mode (param > config) + mode = inference_mode or self.config.get("inference_mode", "local") - # 2. Execute ComfyUI workflow - return await self._call_comfyui_workflow( - workflow_info=workflow_info, - text=text, - comfyui_url=comfyui_url, - runninghub_api_key=runninghub_api_key, - voice=voice, - speed=speed, - output_path=output_path, - **params - ) + # Route to appropriate implementation + if mode == "local": + return await self._call_local_tts( + text=text, + voice=voice, + speed=speed, + output_path=output_path + ) + else: # comfyui + # 1. Resolve workflow (returns structured info) + workflow_info = self._resolve_workflow(workflow=workflow) + + # 2. Execute ComfyUI workflow + return await self._call_comfyui_workflow( + workflow_info=workflow_info, + text=text, + comfyui_url=comfyui_url, + runninghub_api_key=runninghub_api_key, + voice=voice, + speed=speed, + output_path=output_path, + **params + ) + + async def _call_local_tts( + self, + text: str, + voice: Optional[str] = None, + speed: Optional[float] = None, + output_path: Optional[str] = None, + ) -> str: + """ + Generate speech using local Edge TTS + + Args: + text: Text to convert to speech + voice: Edge TTS voice ID (default: from config) + speed: Speech speed multiplier (default: from config) + output_path: Custom output path (auto-generated if None) + + Returns: + Generated audio file path + """ + # Get config defaults + local_config = self.config.get("local", {}) + + # Determine voice and speed (param > config) + final_voice = voice or local_config.get("voice", "zh-CN-YunjianNeural") + final_speed = speed if speed is not None else local_config.get("speed", 1.2) + + # Convert speed to rate parameter + rate = speed_to_rate(final_speed) + + logger.info(f"🎙️ Using local Edge TTS: voice={final_voice}, speed={final_speed}x (rate={rate})") + + # Generate output path if not provided + if not output_path: + # Generate unique filename + unique_id = uuid.uuid4().hex + output_path = f"output/{unique_id}.mp3" + + # Ensure output directory exists + Path("output").mkdir(parents=True, exist_ok=True) + + # Call Edge TTS + try: + audio_bytes = await edge_tts( + text=text, + voice=final_voice, + rate=rate, + output_path=output_path + ) + + logger.info(f"✅ Generated audio (local Edge TTS): {output_path}") + return output_path + + except Exception as e: + logger.error(f"Local TTS generation error: {e}") + raise async def _call_comfyui_workflow( self, diff --git a/pixelle_video/tts_voices.py b/pixelle_video/tts_voices.py new file mode 100644 index 0000000..b317859 --- /dev/null +++ b/pixelle_video/tts_voices.py @@ -0,0 +1,147 @@ +""" +TTS Voice Configuration + +Defines available voices for local Edge TTS inference. +""" + +from typing import List, Dict, Any + + +# Edge TTS voice presets for local inference +EDGE_TTS_VOICES: List[Dict[str, Any]] = [ + # Chinese voices + { + "id": "zh-CN-XiaoxiaoNeural", + "label_key": "tts.voice.zh_CN_XiaoxiaoNeural", + "locale": "zh-CN", + "gender": "female" + }, + { + "id": "zh-CN-XiaoyiNeural", + "label_key": "tts.voice.zh_CN_XiaoyiNeural", + "locale": "zh-CN", + "gender": "female" + }, + { + "id": "zh-CN-YunjianNeural", + "label_key": "tts.voice.zh_CN_YunjianNeural", + "locale": "zh-CN", + "gender": "male" + }, + { + "id": "zh-CN-YunxiNeural", + "label_key": "tts.voice.zh_CN_YunxiNeural", + "locale": "zh-CN", + "gender": "male" + }, + { + "id": "zh-CN-YunyangNeural", + "label_key": "tts.voice.zh_CN_YunyangNeural", + "locale": "zh-CN", + "gender": "male" + }, + { + "id": "zh-CN-YunyeNeural", + "label_key": "tts.voice.zh_CN_YunyeNeural", + "locale": "zh-CN", + "gender": "male" + }, + { + "id": "zh-CN-YunfengNeural", + "label_key": "tts.voice.zh_CN_YunfengNeural", + "locale": "zh-CN", + "gender": "male" + }, + { + "id": "zh-CN-liaoning-XiaobeiNeural", + "label_key": "tts.voice.zh_CN_liaoning_XiaobeiNeural", + "locale": "zh-CN", + "gender": "female" + }, + + # English voices + { + "id": "en-US-AriaNeural", + "label_key": "tts.voice.en_US_AriaNeural", + "locale": "en-US", + "gender": "female" + }, + { + "id": "en-US-JennyNeural", + "label_key": "tts.voice.en_US_JennyNeural", + "locale": "en-US", + "gender": "female" + }, + { + "id": "en-US-GuyNeural", + "label_key": "tts.voice.en_US_GuyNeural", + "locale": "en-US", + "gender": "male" + }, + { + "id": "en-US-DavisNeural", + "label_key": "tts.voice.en_US_DavisNeural", + "locale": "en-US", + "gender": "male" + }, + { + "id": "en-GB-SoniaNeural", + "label_key": "tts.voice.en_GB_SoniaNeural", + "locale": "en-GB", + "gender": "female" + }, + { + "id": "en-GB-RyanNeural", + "label_key": "tts.voice.en_GB_RyanNeural", + "locale": "en-GB", + "gender": "male" + }, +] + + +def get_voice_display_name(voice_id: str, tr_func=None, locale: str = "zh_CN") -> str: + """ + Get display name for voice + + Args: + voice_id: Voice ID (e.g., "zh-CN-YunjianNeural") + tr_func: Translation function (optional) + locale: Current locale (default: "zh_CN") + + Returns: + Display name (translated label if in Chinese, otherwise voice ID) + """ + # Find voice config + voice_config = next((v for v in EDGE_TTS_VOICES if v["id"] == voice_id), None) + + if not voice_config: + return voice_id + + # If Chinese locale and translation function available, use translated label + if locale == "zh_CN" and tr_func: + label_key = voice_config["label_key"] + return tr_func(label_key) + + # For other locales, return voice ID + return voice_id + + +def speed_to_rate(speed: float) -> str: + """ + Convert speed multiplier to Edge TTS rate parameter + + Args: + speed: Speed multiplier (1.0 = normal, 1.2 = 120%) + + Returns: + Rate string (e.g., "+20%", "-10%") + + Examples: + 1.0 → "+0%" + 1.2 → "+20%" + 0.8 → "-20%" + """ + percentage = int((speed - 1.0) * 100) + sign = "+" if percentage >= 0 else "" + return f"{sign}{percentage}%" + diff --git a/web/app.py b/web/app.py index 3d39b71..eea3c20 100644 --- a/web/app.py +++ b/web/app.py @@ -449,58 +449,146 @@ def main(): st.markdown(f"**{tr('help.how')}**") st.markdown(tr("tts.how")) - # Get available TTS workflows - tts_workflows = pixelle_video.tts.list_workflows() - - # Build options for selectbox - tts_workflow_options = [wf["display_name"] for wf in tts_workflows] - tts_workflow_keys = [wf["key"] for wf in tts_workflows] - - # Default to saved workflow if exists - default_tts_index = 0 + # Get TTS config comfyui_config = config_manager.get_comfyui_config() - saved_tts_workflow = comfyui_config["tts"]["default_workflow"] - if saved_tts_workflow and saved_tts_workflow in tts_workflow_keys: - default_tts_index = tts_workflow_keys.index(saved_tts_workflow) + tts_config = comfyui_config["tts"] - tts_workflow_display = st.selectbox( - "TTS Workflow", - tts_workflow_options if tts_workflow_options else ["No TTS workflows found"], - index=default_tts_index, - label_visibility="collapsed", - key="tts_workflow_select" + # Inference mode selection + tts_mode = st.radio( + tr("tts.inference_mode"), + ["local", "comfyui"], + horizontal=True, + format_func=lambda x: tr(f"tts.mode.{x}"), + index=0 if tts_config.get("inference_mode", "local") == "local" else 1, + key="tts_inference_mode" ) - # Get the actual workflow key - if tts_workflow_options: - tts_selected_index = tts_workflow_options.index(tts_workflow_display) - tts_workflow_key = tts_workflow_keys[tts_selected_index] + # Show hint based on mode + if tts_mode == "local": + st.caption(tr("tts.mode.local_hint")) else: - tts_workflow_key = "selfhost/tts_edge.json" # fallback + st.caption(tr("tts.mode.comfyui_hint")) - # Reference audio upload (optional, for voice cloning) - ref_audio_file = st.file_uploader( - tr("tts.ref_audio"), - type=["mp3", "wav", "flac", "m4a", "aac", "ogg"], - help=tr("tts.ref_audio_help"), - key="ref_audio_upload" - ) - - # Save uploaded ref_audio to temp file if provided - ref_audio_path = None - if ref_audio_file is not None: - # Audio preview player (directly play uploaded file) - st.audio(ref_audio_file) + # ================================================================ + # Local Mode UI + # ================================================================ + if tts_mode == "local": + # Import voice configuration + from pixelle_video.tts_voices import EDGE_TTS_VOICES, get_voice_display_name - # Save to temp directory - import tempfile - temp_dir = Path("temp") - temp_dir.mkdir(exist_ok=True) - ref_audio_path = temp_dir / f"ref_audio_{ref_audio_file.name}" - with open(ref_audio_path, "wb") as f: - f.write(ref_audio_file.getbuffer()) + # Get saved voice from config + local_config = tts_config.get("local", {}) + saved_voice = local_config.get("voice", "zh-CN-YunjianNeural") + saved_speed = local_config.get("speed", 1.2) + + # Build voice options with i18n + voice_options = [] + voice_ids = [] + default_voice_index = 0 + + for idx, voice_config in enumerate(EDGE_TTS_VOICES): + voice_id = voice_config["id"] + display_name = get_voice_display_name(voice_id, tr, get_language()) + voice_options.append(display_name) + voice_ids.append(voice_id) + + # Set default index if matches saved voice + if voice_id == saved_voice: + default_voice_index = idx + + # Two-column layout: Voice | Speed + voice_col, speed_col = st.columns([1, 1]) + + with voice_col: + # Voice selector + selected_voice_display = st.selectbox( + tr("tts.voice_selector"), + voice_options, + index=default_voice_index, + key="tts_local_voice" + ) + + # Get actual voice ID + selected_voice_index = voice_options.index(selected_voice_display) + selected_voice = voice_ids[selected_voice_index] + + with speed_col: + # Speed slider + tts_speed = st.slider( + tr("tts.speed"), + min_value=0.5, + max_value=2.0, + value=saved_speed, + step=0.1, + format="%.1fx", + key="tts_local_speed" + ) + st.caption(tr("tts.speed_label", speed=f"{tts_speed:.1f}")) + + # Variables for video generation + tts_workflow_key = None + ref_audio_path = None - # TTS preview expander (simplified, uses default voice and speed) + # ================================================================ + # ComfyUI Mode UI + # ================================================================ + else: # comfyui mode + # Get available TTS workflows + tts_workflows = pixelle_video.tts.list_workflows() + + # Build options for selectbox + tts_workflow_options = [wf["display_name"] for wf in tts_workflows] + tts_workflow_keys = [wf["key"] for wf in tts_workflows] + + # Default to saved workflow if exists + default_tts_index = 0 + saved_tts_workflow = tts_config.get("comfyui", {}).get("default_workflow") + if saved_tts_workflow and saved_tts_workflow in tts_workflow_keys: + default_tts_index = tts_workflow_keys.index(saved_tts_workflow) + + tts_workflow_display = st.selectbox( + "TTS Workflow", + tts_workflow_options if tts_workflow_options else ["No TTS workflows found"], + index=default_tts_index, + label_visibility="collapsed", + key="tts_workflow_select" + ) + + # Get the actual workflow key + if tts_workflow_options: + tts_selected_index = tts_workflow_options.index(tts_workflow_display) + tts_workflow_key = tts_workflow_keys[tts_selected_index] + else: + tts_workflow_key = "selfhost/tts_edge.json" # fallback + + # Reference audio upload (optional, for voice cloning) + ref_audio_file = st.file_uploader( + tr("tts.ref_audio"), + type=["mp3", "wav", "flac", "m4a", "aac", "ogg"], + help=tr("tts.ref_audio_help"), + key="ref_audio_upload" + ) + + # Save uploaded ref_audio to temp file if provided + ref_audio_path = None + if ref_audio_file is not None: + # Audio preview player (directly play uploaded file) + st.audio(ref_audio_file) + + # Save to temp directory + temp_dir = Path("temp") + temp_dir.mkdir(exist_ok=True) + ref_audio_path = temp_dir / f"ref_audio_{ref_audio_file.name}" + with open(ref_audio_path, "wb") as f: + f.write(ref_audio_file.getbuffer()) + + # Variables for video generation + selected_voice = None + tts_speed = None + + # ================================================================ + # TTS Preview (works for both modes) + # ================================================================ with st.expander(tr("tts.preview_title"), expanded=False): # Preview text input preview_text = st.text_input( @@ -514,14 +602,19 @@ def main(): if st.button(tr("tts.preview_button"), key="preview_tts", use_container_width=True): with st.spinner(tr("tts.previewing")): try: - # Generate preview audio using selected workflow (use default voice and speed) - # Pass ref_audio if uploaded + # Build TTS params based on mode tts_params = { "text": preview_text, - "workflow": tts_workflow_key + "inference_mode": tts_mode } - if ref_audio_path: - tts_params["ref_audio"] = str(ref_audio_path) + + if tts_mode == "local": + tts_params["voice"] = selected_voice + tts_params["speed"] = tts_speed + else: # comfyui + tts_params["workflow"] = tts_workflow_key + if ref_audio_path: + tts_params["ref_audio"] = str(ref_audio_path) audio_path = run_async(pixelle_video.tts(**tts_params)) @@ -979,7 +1072,6 @@ def main(): "mode": mode, "title": title if title else None, "n_scenes": n_scenes, - "tts_workflow": tts_workflow_key, "image_workflow": workflow_key, "image_width": int(image_width), "image_height": int(image_height), @@ -989,14 +1081,20 @@ def main(): "progress_callback": update_progress, } + # Add TTS parameters based on mode + video_params["tts_inference_mode"] = tts_mode + if tts_mode == "local": + video_params["tts_voice"] = selected_voice + video_params["tts_speed"] = tts_speed + else: # comfyui + video_params["tts_workflow"] = tts_workflow_key + if ref_audio_path: + video_params["ref_audio"] = str(ref_audio_path) + # Add custom template parameters if any if custom_values_for_video: video_params["template_params"] = custom_values_for_video - # Add ref_audio if uploaded - if ref_audio_path: - video_params["ref_audio"] = str(ref_audio_path) - result = run_async(pixelle_video.generate_video(**video_params)) progress_bar.progress(100) diff --git a/web/i18n/locales/en_US.json b/web/i18n/locales/en_US.json index 8dad7bb..b7c2b06 100644 --- a/web/i18n/locales/en_US.json +++ b/web/i18n/locales/en_US.json @@ -179,6 +179,31 @@ "settings.comfyui.runninghub_api_key": "RunningHub API Key", "settings.comfyui.runninghub_api_key_help": "Visit https://runninghub.ai to register and get API Key", + "tts.inference_mode": "Synthesis Mode", + "tts.mode.local": "Local Synthesis", + "tts.mode.comfyui": "ComfyUI Synthesis", + "tts.mode.local_hint": "💡 Using Edge TTS, no configuration required, ready to use", + "tts.mode.comfyui_hint": "⚙️ Using ComfyUI workflows, flexible and powerful", + + "tts.voice_selector": "Voice Selection", + "tts.speed": "Speed", + "tts.speed_label": "{speed}x", + + "tts.voice.zh_CN_XiaoxiaoNeural": "zh-CN-XiaoxiaoNeural", + "tts.voice.zh_CN_XiaoyiNeural": "zh-CN-XiaoyiNeural", + "tts.voice.zh_CN_YunjianNeural": "zh-CN-YunjianNeural", + "tts.voice.zh_CN_YunxiNeural": "zh-CN-YunxiNeural", + "tts.voice.zh_CN_YunyangNeural": "zh-CN-YunyangNeural", + "tts.voice.zh_CN_YunyeNeural": "zh-CN-YunyeNeural", + "tts.voice.zh_CN_YunfengNeural": "zh-CN-YunfengNeural", + "tts.voice.zh_CN_liaoning_XiaobeiNeural": "zh-CN-liaoning-XiaobeiNeural", + "tts.voice.en_US_AriaNeural": "en-US-AriaNeural", + "tts.voice.en_US_JennyNeural": "en-US-JennyNeural", + "tts.voice.en_US_GuyNeural": "en-US-GuyNeural", + "tts.voice.en_US_DavisNeural": "en-US-DavisNeural", + "tts.voice.en_GB_SoniaNeural": "en-GB-SoniaNeural", + "tts.voice.en_GB_RyanNeural": "en-GB-RyanNeural", + "tts.selector": "Workflow Selection", "tts.what": "Converts narration text to natural human-like speech (some workflows support reference audio for voice cloning)", "tts.how": "Place tts_xxx.json workflow files in workflows/selfhost/ (local ComfyUI) or workflows/runninghub/ (cloud) folder", diff --git a/web/i18n/locales/zh_CN.json b/web/i18n/locales/zh_CN.json index 2b37f57..187c377 100644 --- a/web/i18n/locales/zh_CN.json +++ b/web/i18n/locales/zh_CN.json @@ -179,6 +179,31 @@ "settings.comfyui.runninghub_api_key": "RunningHub API 密钥", "settings.comfyui.runninghub_api_key_help": "访问 https://runninghub.ai 注册并获取 API Key", + "tts.inference_mode": "合成方式", + "tts.mode.local": "本地合成", + "tts.mode.comfyui": "ComfyUI 合成", + "tts.mode.local_hint": "💡 使用 Edge TTS,无需配置,开箱即用(请确保网络环境可用)", + "tts.mode.comfyui_hint": "⚙️ 使用 ComfyUI 工作流,灵活强大", + + "tts.voice_selector": "音色选择", + "tts.speed": "语速", + "tts.speed_label": "{speed}x", + + "tts.voice.zh_CN_XiaoxiaoNeural": "女声-温柔(晓晓)", + "tts.voice.zh_CN_XiaoyiNeural": "女声-甜美(晓伊)", + "tts.voice.zh_CN_YunjianNeural": "男声-专业(云健)", + "tts.voice.zh_CN_YunxiNeural": "男声-磁性(云希)", + "tts.voice.zh_CN_YunyangNeural": "男声-新闻(云扬)", + "tts.voice.zh_CN_YunyeNeural": "男声-自然(云野)", + "tts.voice.zh_CN_YunfengNeural": "男声-沉稳(云锋)", + "tts.voice.zh_CN_liaoning_XiaobeiNeural": "女声-东北(小北)", + "tts.voice.en_US_AriaNeural": "女声-自然(Aria)", + "tts.voice.en_US_JennyNeural": "女声-温暖(Jenny)", + "tts.voice.en_US_GuyNeural": "男声-标准(Guy)", + "tts.voice.en_US_DavisNeural": "男声-友好(Davis)", + "tts.voice.en_GB_SoniaNeural": "女声-英式(Sonia)", + "tts.voice.en_GB_RyanNeural": "男声-英式(Ryan)", + "tts.selector": "工作流选择", "tts.what": "将旁白文本转换为真人般的自然语音(部分工作流支持参考音频克隆声音)", "tts.how": "将 tts_xxx.json 工作流文件放入 workflows/selfhost/(本地 ComfyUI)或 workflows/runninghub/(云端)文件夹",