tts支持本地合成

This commit is contained in:
puke
2025-11-06 21:06:14 +08:00
parent 56b6b74af7
commit 393cdb8f0a
9 changed files with 531 additions and 112 deletions

View File

@@ -14,9 +14,28 @@ class LLMConfig(BaseModel):
model: str = Field(default="", description="LLM Model Name") model: str = Field(default="", description="LLM Model Name")
class TTSLocalConfig(BaseModel):
    """Local TTS configuration (Edge TTS).

    Used when the TTS inference mode is "local"; synthesis runs via Edge TTS
    without requiring a ComfyUI server.
    """
    # Edge TTS voice identifier, e.g. "zh-CN-YunjianNeural" (see EDGE_TTS_VOICES)
    voice: str = Field(default="zh-CN-YunjianNeural", description="Edge TTS voice ID")
    # Playback speed multiplier; pydantic enforces the 0.5-2.0 range (1.0 = normal)
    speed: float = Field(default=1.2, ge=0.5, le=2.0, description="Speech speed multiplier (0.5-2.0)")
class TTSComfyUIConfig(BaseModel):
    """ComfyUI TTS configuration.

    Used when the TTS inference mode is "comfyui"; synthesis is delegated to a
    ComfyUI workflow.
    """
    # Workflow filename to use when the caller does not specify one; None means
    # the service falls back to its own workflow-resolution default
    default_workflow: Optional[str] = Field(default=None, description="Default TTS workflow (optional)")
class TTSSubConfig(BaseModel): class TTSSubConfig(BaseModel):
"""TTS-specific configuration (under comfyui.tts)""" """TTS-specific configuration (under comfyui.tts)"""
default_workflow: Optional[str] = Field(default=None, description="Default TTS workflow (optional)") inference_mode: str = Field(default="local", description="TTS inference mode: 'local' or 'comfyui'")
local: TTSLocalConfig = Field(default_factory=TTSLocalConfig, description="Local TTS (Edge TTS) configuration")
comfyui: TTSComfyUIConfig = Field(default_factory=TTSComfyUIConfig, description="ComfyUI TTS configuration")
# Backward compatibility: keep default_workflow at top level
@property
def default_workflow(self) -> Optional[str]:
"""Get default workflow (for backward compatibility)"""
return self.comfyui.default_workflow
class ImageSubConfig(BaseModel): class ImageSubConfig(BaseModel):

View File

@@ -24,10 +24,11 @@ class StoryboardConfig:
video_fps: int = 30 # Frame rate video_fps: int = 30 # Frame rate
# Audio parameters # Audio parameters
voice_id: str = "[Chinese] zh-CN Yunjian" # Default voice tts_inference_mode: str = "local" # TTS inference mode: "local" or "comfyui"
tts_workflow: Optional[str] = None # TTS workflow filename (None = use default) voice_id: Optional[str] = None # Voice ID (for local: Edge TTS voice ID; for comfyui: workflow-specific)
tts_speed: float = 1.2 # TTS speed multiplier (1.0 = normal, >1.0 = faster) tts_workflow: Optional[str] = None # TTS workflow filename (for ComfyUI mode, None = use default)
ref_audio: Optional[str] = None # Reference audio for voice cloning (only some workflows support this) tts_speed: Optional[float] = None # TTS speed multiplier (0.5-2.0, 1.0 = normal)
ref_audio: Optional[str] = None # Reference audio for voice cloning (ComfyUI mode only)
# Image parameters # Image parameters
image_width: int = 1024 image_width: int = 1024

View File

@@ -62,10 +62,17 @@ class StandardPipeline(BasePipeline):
# === Basic Config === # === Basic Config ===
n_scenes: int = 5, # Only used in generate mode; ignored in fixed mode n_scenes: int = 5, # Only used in generate mode; ignored in fixed mode
voice_id: str = "[Chinese] zh-CN Yunjian",
tts_workflow: Optional[str] = None, # === TTS Parameters ===
tts_speed: float = 1.2, tts_inference_mode: Optional[str] = None, # "local" or "comfyui"
ref_audio: Optional[str] = None, # Reference audio for voice cloning tts_voice: Optional[str] = None, # For local mode: Edge TTS voice ID
tts_speed: Optional[float] = None, # Speed multiplier (0.5-2.0)
tts_workflow: Optional[str] = None, # For ComfyUI mode: workflow path
ref_audio: Optional[str] = None, # For ComfyUI mode: reference audio
# Deprecated (kept for backward compatibility)
voice_id: Optional[str] = None,
output_path: Optional[str] = None, output_path: Optional[str] = None,
# === LLM Parameters === # === LLM Parameters ===
@@ -191,6 +198,29 @@ class StandardPipeline(BasePipeline):
output_path = get_task_final_video_path(task_id) output_path = get_task_final_video_path(task_id)
logger.info(f" Will copy final video to: {user_specified_output}") logger.info(f" Will copy final video to: {user_specified_output}")
# Determine TTS inference mode and parameters
# Priority: explicit params > backward compatibility > config defaults
if tts_inference_mode is None:
# Check if user provided ComfyUI-specific params
if tts_workflow is not None or ref_audio is not None:
tts_inference_mode = "comfyui"
# Check if user provided old voice_id param (backward compatibility)
elif voice_id is not None:
tts_inference_mode = "comfyui"
if tts_voice is None:
tts_voice = voice_id
else:
# Use config default
tts_config = self.core.config.get("comfyui", {}).get("tts", {})
tts_inference_mode = tts_config.get("inference_mode", "local")
# Set voice_id based on mode for StoryboardConfig
final_voice_id = None
if tts_inference_mode == "local":
final_voice_id = tts_voice or voice_id
else: # comfyui
final_voice_id = voice_id # For ComfyUI, might be None
# Create storyboard config # Create storyboard config
config = StoryboardConfig( config = StoryboardConfig(
task_id=task_id, task_id=task_id,
@@ -200,7 +230,8 @@ class StandardPipeline(BasePipeline):
min_image_prompt_words=min_image_prompt_words, min_image_prompt_words=min_image_prompt_words,
max_image_prompt_words=max_image_prompt_words, max_image_prompt_words=max_image_prompt_words,
video_fps=video_fps, video_fps=video_fps,
voice_id=voice_id, tts_inference_mode=tts_inference_mode,
voice_id=final_voice_id,
tts_workflow=tts_workflow, tts_workflow=tts_workflow,
tts_speed=tts_speed, tts_speed=tts_speed,
ref_audio=ref_audio, ref_audio=ref_audio,

View File

@@ -124,18 +124,29 @@ class FrameProcessor:
from pixelle_video.utils.os_util import get_task_frame_path from pixelle_video.utils.os_util import get_task_frame_path
output_path = get_task_frame_path(config.task_id, frame.index, "audio") output_path = get_task_frame_path(config.task_id, frame.index, "audio")
# Call TTS with specific output path and workflow # Build TTS params based on inference mode
tts_params = { tts_params = {
"text": frame.narration, "text": frame.narration,
"workflow": config.tts_workflow, "inference_mode": config.tts_inference_mode,
"voice": config.voice_id,
"speed": config.tts_speed,
"output_path": output_path, "output_path": output_path,
} }
# Add ref_audio if provided if config.tts_inference_mode == "local":
if config.ref_audio: # Local mode: pass voice and speed
tts_params["ref_audio"] = config.ref_audio if config.voice_id:
tts_params["voice"] = config.voice_id
if config.tts_speed is not None:
tts_params["speed"] = config.tts_speed
else: # comfyui
# ComfyUI mode: pass workflow, voice, speed, and ref_audio
if config.tts_workflow:
tts_params["workflow"] = config.tts_workflow
if config.voice_id:
tts_params["voice"] = config.voice_id
if config.tts_speed is not None:
tts_params["speed"] = config.tts_speed
if config.ref_audio:
tts_params["ref_audio"] = config.ref_audio
audio_path = await self.core.tts(**tts_params) audio_path = await self.core.tts(**tts_params)

View File

@@ -1,13 +1,18 @@
""" """
TTS (Text-to-Speech) Service - ComfyUI Workflow-based implementation TTS (Text-to-Speech) Service - Supports both local and ComfyUI inference
""" """
import os
import uuid
from pathlib import Path
from typing import Optional from typing import Optional
from comfykit import ComfyKit from comfykit import ComfyKit
from loguru import logger from loguru import logger
from pixelle_video.services.comfy_base_service import ComfyBaseService from pixelle_video.services.comfy_base_service import ComfyBaseService
from pixelle_video.utils.tts_util import edge_tts
from pixelle_video.tts_voices import speed_to_rate
class TTSService(ComfyBaseService): class TTSService(ComfyBaseService):
@@ -52,22 +57,25 @@ class TTSService(ComfyBaseService):
comfyui_url: Optional[str] = None, comfyui_url: Optional[str] = None,
runninghub_api_key: Optional[str] = None, runninghub_api_key: Optional[str] = None,
# TTS parameters # TTS parameters
voice: str = "[Chinese] zh-CN Yunjian", voice: Optional[str] = None,
speed: float = 1.2, speed: Optional[float] = None,
# Inference mode override
inference_mode: Optional[str] = None,
# Output path # Output path
output_path: Optional[str] = None, output_path: Optional[str] = None,
**params **params
) -> str: ) -> str:
""" """
Generate speech using ComfyUI workflow Generate speech using local Edge TTS or ComfyUI workflow
Args: Args:
text: Text to convert to speech text: Text to convert to speech
workflow: Workflow filename (default: from config) workflow: Workflow filename (for ComfyUI mode, default: from config)
comfyui_url: ComfyUI URL (optional, overrides config) comfyui_url: ComfyUI URL (optional, overrides config)
runninghub_api_key: RunningHub API key (optional, overrides config) runninghub_api_key: RunningHub API key (optional, overrides config)
voice: Voice ID (workflow-specific) voice: Voice ID (for local mode: Edge TTS voice ID; for ComfyUI: workflow-specific)
speed: Speech speed multiplier (1.0 = normal, >1.0 = faster, <1.0 = slower) speed: Speech speed multiplier (1.0 = normal, >1.0 = faster, <1.0 = slower)
inference_mode: Override inference mode ("local" or "comfyui", default: from config)
output_path: Custom output path (auto-generated if None) output_path: Custom output path (auto-generated if None)
**params: Additional workflow parameters **params: Additional workflow parameters
@@ -75,49 +83,103 @@ class TTSService(ComfyBaseService):
Generated audio file path Generated audio file path
Examples: Examples:
# Simplest: use default workflow # Local inference (Edge TTS)
audio_path = await pixelle_video.tts(text="Hello, world!")
# Use specific workflow
audio_path = await pixelle_video.tts( audio_path = await pixelle_video.tts(
text="你好,世界!", text="Hello, world!",
workflow="tts_edge.json" inference_mode="local",
) voice="zh-CN-YunjianNeural",
# With voice and speed
audio_path = await pixelle_video.tts(
text="Hello",
workflow="tts_edge.json",
voice="[Chinese] zh-CN Xiaoxiao",
speed=1.2 speed=1.2
) )
# With absolute path # ComfyUI inference
audio_path = await pixelle_video.tts( audio_path = await pixelle_video.tts(
text="Hello", text="你好,世界!",
workflow="/path/to/custom_tts.json" inference_mode="comfyui",
) workflow="runninghub/tts_edge.json"
# With custom ComfyUI server
audio_path = await pixelle_video.tts(
text="Hello",
comfyui_url="http://192.168.1.100:8188"
) )
""" """
# 1. Resolve workflow (returns structured info) # Determine inference mode (param > config)
workflow_info = self._resolve_workflow(workflow=workflow) mode = inference_mode or self.config.get("inference_mode", "local")
# 2. Execute ComfyUI workflow # Route to appropriate implementation
return await self._call_comfyui_workflow( if mode == "local":
workflow_info=workflow_info, return await self._call_local_tts(
text=text, text=text,
comfyui_url=comfyui_url, voice=voice,
runninghub_api_key=runninghub_api_key, speed=speed,
voice=voice, output_path=output_path
speed=speed, )
output_path=output_path, else: # comfyui
**params # 1. Resolve workflow (returns structured info)
) workflow_info = self._resolve_workflow(workflow=workflow)
# 2. Execute ComfyUI workflow
return await self._call_comfyui_workflow(
workflow_info=workflow_info,
text=text,
comfyui_url=comfyui_url,
runninghub_api_key=runninghub_api_key,
voice=voice,
speed=speed,
output_path=output_path,
**params
)
async def _call_local_tts(
    self,
    text: str,
    voice: Optional[str] = None,
    speed: Optional[float] = None,
    output_path: Optional[str] = None,
) -> str:
    """
    Generate speech using local Edge TTS.

    Args:
        text: Text to convert to speech
        voice: Edge TTS voice ID (default: from config)
        speed: Speech speed multiplier (default: from config)
        output_path: Custom output path (auto-generated if None)

    Returns:
        Generated audio file path

    Raises:
        Exception: Re-raises any error from the underlying Edge TTS call.
    """
    # Get config defaults for local mode
    local_config = self.config.get("local", {})

    # Determine voice and speed (explicit param > config > hard-coded default)
    final_voice = voice or local_config.get("voice", "zh-CN-YunjianNeural")
    final_speed = speed if speed is not None else local_config.get("speed", 1.2)

    # Convert the multiplier to Edge TTS's "rate" string (e.g. 1.2 -> "+20%")
    rate = speed_to_rate(final_speed)

    logger.info(f"🎙️ Using local Edge TTS: voice={final_voice}, speed={final_speed}x (rate={rate})")

    # Generate output path if not provided
    if not output_path:
        unique_id = uuid.uuid4().hex
        output_path = f"output/{unique_id}.mp3"

    # Ensure the destination directory exists — this also covers
    # caller-supplied paths, which the previous code did not create.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Call Edge TTS; it writes the audio to output_path (return value unused)
    try:
        await edge_tts(
            text=text,
            voice=final_voice,
            rate=rate,
            output_path=output_path
        )
        logger.info(f"✅ Generated audio (local Edge TTS): {output_path}")
        return output_path
    except Exception as e:
        logger.error(f"Local TTS generation error: {e}")
        raise
async def _call_comfyui_workflow( async def _call_comfyui_workflow(
self, self,

147
pixelle_video/tts_voices.py Normal file
View File

@@ -0,0 +1,147 @@
"""
TTS Voice Configuration
Defines available voices for local Edge TTS inference.
"""
from typing import List, Dict, Any
# Edge TTS voice presets for local inference
# Each entry: "id" = Edge TTS voice identifier passed to the synthesizer;
# "label_key" = i18n key used to look up a translated display name;
# "locale" = BCP-47 language tag; "gender" = informational tag for the UI.
EDGE_TTS_VOICES: List[Dict[str, Any]] = [
    # Chinese voices
    {
        "id": "zh-CN-XiaoxiaoNeural",
        "label_key": "tts.voice.zh_CN_XiaoxiaoNeural",
        "locale": "zh-CN",
        "gender": "female"
    },
    {
        "id": "zh-CN-XiaoyiNeural",
        "label_key": "tts.voice.zh_CN_XiaoyiNeural",
        "locale": "zh-CN",
        "gender": "female"
    },
    {
        "id": "zh-CN-YunjianNeural",
        "label_key": "tts.voice.zh_CN_YunjianNeural",
        "locale": "zh-CN",
        "gender": "male"
    },
    {
        "id": "zh-CN-YunxiNeural",
        "label_key": "tts.voice.zh_CN_YunxiNeural",
        "locale": "zh-CN",
        "gender": "male"
    },
    {
        "id": "zh-CN-YunyangNeural",
        "label_key": "tts.voice.zh_CN_YunyangNeural",
        "locale": "zh-CN",
        "gender": "male"
    },
    {
        "id": "zh-CN-YunyeNeural",
        "label_key": "tts.voice.zh_CN_YunyeNeural",
        "locale": "zh-CN",
        "gender": "male"
    },
    {
        "id": "zh-CN-YunfengNeural",
        "label_key": "tts.voice.zh_CN_YunfengNeural",
        "locale": "zh-CN",
        "gender": "male"
    },
    # Regional accent voice (Liaoning / north-eastern Mandarin)
    {
        "id": "zh-CN-liaoning-XiaobeiNeural",
        "label_key": "tts.voice.zh_CN_liaoning_XiaobeiNeural",
        "locale": "zh-CN",
        "gender": "female"
    },
    # English voices
    {
        "id": "en-US-AriaNeural",
        "label_key": "tts.voice.en_US_AriaNeural",
        "locale": "en-US",
        "gender": "female"
    },
    {
        "id": "en-US-JennyNeural",
        "label_key": "tts.voice.en_US_JennyNeural",
        "locale": "en-US",
        "gender": "female"
    },
    {
        "id": "en-US-GuyNeural",
        "label_key": "tts.voice.en_US_GuyNeural",
        "locale": "en-US",
        "gender": "male"
    },
    {
        "id": "en-US-DavisNeural",
        "label_key": "tts.voice.en_US_DavisNeural",
        "locale": "en-US",
        "gender": "male"
    },
    {
        "id": "en-GB-SoniaNeural",
        "label_key": "tts.voice.en_GB_SoniaNeural",
        "locale": "en-GB",
        "gender": "female"
    },
    {
        "id": "en-GB-RyanNeural",
        "label_key": "tts.voice.en_GB_RyanNeural",
        "locale": "en-GB",
        "gender": "male"
    },
]
def get_voice_display_name(voice_id: str, tr_func=None, locale: str = "zh_CN") -> str:
    """
    Resolve a human-readable display name for an Edge TTS voice.

    Args:
        voice_id: Voice ID (e.g., "zh-CN-YunjianNeural")
        tr_func: Translation function (optional)
        locale: Current locale (default: "zh_CN")

    Returns:
        Translated label when the locale is Chinese and a translator is
        supplied; the raw voice ID in every other case (including unknown IDs).
    """
    for entry in EDGE_TTS_VOICES:
        if entry["id"] != voice_id:
            continue
        # Known voice: only the Chinese UI gets a translated label
        if locale == "zh_CN" and tr_func:
            return tr_func(entry["label_key"])
        return voice_id
    # Unknown voice ID: fall back to showing it verbatim
    return voice_id
def speed_to_rate(speed: float) -> str:
    """
    Convert speed multiplier to Edge TTS rate parameter.

    Args:
        speed: Speed multiplier (1.0 = normal, 1.2 = 120%)

    Returns:
        Rate string (e.g., "+20%", "-10%")

    Examples:
        1.0 → "+0%"
        1.2 → "+20%"
        0.8 → "-20%"
    """
    # Use round(), not int(): int() truncates toward zero, and binary-float
    # error makes (1.2 - 1.0) * 100 == 19.999…, which truncated to 19 and
    # produced "+19%" (and "-19%" for 0.8) instead of the documented values.
    percentage = round((speed - 1.0) * 100)
    sign = "+" if percentage >= 0 else ""
    return f"{sign}{percentage}%"

View File

@@ -449,58 +449,146 @@ def main():
st.markdown(f"**{tr('help.how')}**") st.markdown(f"**{tr('help.how')}**")
st.markdown(tr("tts.how")) st.markdown(tr("tts.how"))
# Get available TTS workflows # Get TTS config
tts_workflows = pixelle_video.tts.list_workflows()
# Build options for selectbox
tts_workflow_options = [wf["display_name"] for wf in tts_workflows]
tts_workflow_keys = [wf["key"] for wf in tts_workflows]
# Default to saved workflow if exists
default_tts_index = 0
comfyui_config = config_manager.get_comfyui_config() comfyui_config = config_manager.get_comfyui_config()
saved_tts_workflow = comfyui_config["tts"]["default_workflow"] tts_config = comfyui_config["tts"]
if saved_tts_workflow and saved_tts_workflow in tts_workflow_keys:
default_tts_index = tts_workflow_keys.index(saved_tts_workflow)
tts_workflow_display = st.selectbox( # Inference mode selection
"TTS Workflow", tts_mode = st.radio(
tts_workflow_options if tts_workflow_options else ["No TTS workflows found"], tr("tts.inference_mode"),
index=default_tts_index, ["local", "comfyui"],
label_visibility="collapsed", horizontal=True,
key="tts_workflow_select" format_func=lambda x: tr(f"tts.mode.{x}"),
index=0 if tts_config.get("inference_mode", "local") == "local" else 1,
key="tts_inference_mode"
) )
# Get the actual workflow key # Show hint based on mode
if tts_workflow_options: if tts_mode == "local":
tts_selected_index = tts_workflow_options.index(tts_workflow_display) st.caption(tr("tts.mode.local_hint"))
tts_workflow_key = tts_workflow_keys[tts_selected_index]
else: else:
tts_workflow_key = "selfhost/tts_edge.json" # fallback st.caption(tr("tts.mode.comfyui_hint"))
# Reference audio upload (optional, for voice cloning) # ================================================================
ref_audio_file = st.file_uploader( # Local Mode UI
tr("tts.ref_audio"), # ================================================================
type=["mp3", "wav", "flac", "m4a", "aac", "ogg"], if tts_mode == "local":
help=tr("tts.ref_audio_help"), # Import voice configuration
key="ref_audio_upload" from pixelle_video.tts_voices import EDGE_TTS_VOICES, get_voice_display_name
)
# Save uploaded ref_audio to temp file if provided
ref_audio_path = None
if ref_audio_file is not None:
# Audio preview player (directly play uploaded file)
st.audio(ref_audio_file)
# Save to temp directory # Get saved voice from config
import tempfile local_config = tts_config.get("local", {})
temp_dir = Path("temp") saved_voice = local_config.get("voice", "zh-CN-YunjianNeural")
temp_dir.mkdir(exist_ok=True) saved_speed = local_config.get("speed", 1.2)
ref_audio_path = temp_dir / f"ref_audio_{ref_audio_file.name}"
with open(ref_audio_path, "wb") as f: # Build voice options with i18n
f.write(ref_audio_file.getbuffer()) voice_options = []
voice_ids = []
default_voice_index = 0
for idx, voice_config in enumerate(EDGE_TTS_VOICES):
voice_id = voice_config["id"]
display_name = get_voice_display_name(voice_id, tr, get_language())
voice_options.append(display_name)
voice_ids.append(voice_id)
# Set default index if matches saved voice
if voice_id == saved_voice:
default_voice_index = idx
# Two-column layout: Voice | Speed
voice_col, speed_col = st.columns([1, 1])
with voice_col:
# Voice selector
selected_voice_display = st.selectbox(
tr("tts.voice_selector"),
voice_options,
index=default_voice_index,
key="tts_local_voice"
)
# Get actual voice ID
selected_voice_index = voice_options.index(selected_voice_display)
selected_voice = voice_ids[selected_voice_index]
with speed_col:
# Speed slider
tts_speed = st.slider(
tr("tts.speed"),
min_value=0.5,
max_value=2.0,
value=saved_speed,
step=0.1,
format="%.1fx",
key="tts_local_speed"
)
st.caption(tr("tts.speed_label", speed=f"{tts_speed:.1f}"))
# Variables for video generation
tts_workflow_key = None
ref_audio_path = None
# TTS preview expander (simplified, uses default voice and speed) # ================================================================
# ComfyUI Mode UI
# ================================================================
else: # comfyui mode
# Get available TTS workflows
tts_workflows = pixelle_video.tts.list_workflows()
# Build options for selectbox
tts_workflow_options = [wf["display_name"] for wf in tts_workflows]
tts_workflow_keys = [wf["key"] for wf in tts_workflows]
# Default to saved workflow if exists
default_tts_index = 0
saved_tts_workflow = tts_config.get("comfyui", {}).get("default_workflow")
if saved_tts_workflow and saved_tts_workflow in tts_workflow_keys:
default_tts_index = tts_workflow_keys.index(saved_tts_workflow)
tts_workflow_display = st.selectbox(
"TTS Workflow",
tts_workflow_options if tts_workflow_options else ["No TTS workflows found"],
index=default_tts_index,
label_visibility="collapsed",
key="tts_workflow_select"
)
# Get the actual workflow key
if tts_workflow_options:
tts_selected_index = tts_workflow_options.index(tts_workflow_display)
tts_workflow_key = tts_workflow_keys[tts_selected_index]
else:
tts_workflow_key = "selfhost/tts_edge.json" # fallback
# Reference audio upload (optional, for voice cloning)
ref_audio_file = st.file_uploader(
tr("tts.ref_audio"),
type=["mp3", "wav", "flac", "m4a", "aac", "ogg"],
help=tr("tts.ref_audio_help"),
key="ref_audio_upload"
)
# Save uploaded ref_audio to temp file if provided
ref_audio_path = None
if ref_audio_file is not None:
# Audio preview player (directly play uploaded file)
st.audio(ref_audio_file)
# Save to temp directory
temp_dir = Path("temp")
temp_dir.mkdir(exist_ok=True)
ref_audio_path = temp_dir / f"ref_audio_{ref_audio_file.name}"
with open(ref_audio_path, "wb") as f:
f.write(ref_audio_file.getbuffer())
# Variables for video generation
selected_voice = None
tts_speed = None
# ================================================================
# TTS Preview (works for both modes)
# ================================================================
with st.expander(tr("tts.preview_title"), expanded=False): with st.expander(tr("tts.preview_title"), expanded=False):
# Preview text input # Preview text input
preview_text = st.text_input( preview_text = st.text_input(
@@ -514,14 +602,19 @@ def main():
if st.button(tr("tts.preview_button"), key="preview_tts", use_container_width=True): if st.button(tr("tts.preview_button"), key="preview_tts", use_container_width=True):
with st.spinner(tr("tts.previewing")): with st.spinner(tr("tts.previewing")):
try: try:
# Generate preview audio using selected workflow (use default voice and speed) # Build TTS params based on mode
# Pass ref_audio if uploaded
tts_params = { tts_params = {
"text": preview_text, "text": preview_text,
"workflow": tts_workflow_key "inference_mode": tts_mode
} }
if ref_audio_path:
tts_params["ref_audio"] = str(ref_audio_path) if tts_mode == "local":
tts_params["voice"] = selected_voice
tts_params["speed"] = tts_speed
else: # comfyui
tts_params["workflow"] = tts_workflow_key
if ref_audio_path:
tts_params["ref_audio"] = str(ref_audio_path)
audio_path = run_async(pixelle_video.tts(**tts_params)) audio_path = run_async(pixelle_video.tts(**tts_params))
@@ -979,7 +1072,6 @@ def main():
"mode": mode, "mode": mode,
"title": title if title else None, "title": title if title else None,
"n_scenes": n_scenes, "n_scenes": n_scenes,
"tts_workflow": tts_workflow_key,
"image_workflow": workflow_key, "image_workflow": workflow_key,
"image_width": int(image_width), "image_width": int(image_width),
"image_height": int(image_height), "image_height": int(image_height),
@@ -989,14 +1081,20 @@ def main():
"progress_callback": update_progress, "progress_callback": update_progress,
} }
# Add TTS parameters based on mode
video_params["tts_inference_mode"] = tts_mode
if tts_mode == "local":
video_params["tts_voice"] = selected_voice
video_params["tts_speed"] = tts_speed
else: # comfyui
video_params["tts_workflow"] = tts_workflow_key
if ref_audio_path:
video_params["ref_audio"] = str(ref_audio_path)
# Add custom template parameters if any # Add custom template parameters if any
if custom_values_for_video: if custom_values_for_video:
video_params["template_params"] = custom_values_for_video video_params["template_params"] = custom_values_for_video
# Add ref_audio if uploaded
if ref_audio_path:
video_params["ref_audio"] = str(ref_audio_path)
result = run_async(pixelle_video.generate_video(**video_params)) result = run_async(pixelle_video.generate_video(**video_params))
progress_bar.progress(100) progress_bar.progress(100)

View File

@@ -179,6 +179,31 @@
"settings.comfyui.runninghub_api_key": "RunningHub API Key", "settings.comfyui.runninghub_api_key": "RunningHub API Key",
"settings.comfyui.runninghub_api_key_help": "Visit https://runninghub.ai to register and get API Key", "settings.comfyui.runninghub_api_key_help": "Visit https://runninghub.ai to register and get API Key",
"tts.inference_mode": "Synthesis Mode",
"tts.mode.local": "Local Synthesis",
"tts.mode.comfyui": "ComfyUI Synthesis",
"tts.mode.local_hint": "💡 Using Edge TTS, no configuration required, ready to use",
"tts.mode.comfyui_hint": "⚙️ Using ComfyUI workflows, flexible and powerful",
"tts.voice_selector": "Voice Selection",
"tts.speed": "Speed",
"tts.speed_label": "{speed}x",
"tts.voice.zh_CN_XiaoxiaoNeural": "zh-CN-XiaoxiaoNeural",
"tts.voice.zh_CN_XiaoyiNeural": "zh-CN-XiaoyiNeural",
"tts.voice.zh_CN_YunjianNeural": "zh-CN-YunjianNeural",
"tts.voice.zh_CN_YunxiNeural": "zh-CN-YunxiNeural",
"tts.voice.zh_CN_YunyangNeural": "zh-CN-YunyangNeural",
"tts.voice.zh_CN_YunyeNeural": "zh-CN-YunyeNeural",
"tts.voice.zh_CN_YunfengNeural": "zh-CN-YunfengNeural",
"tts.voice.zh_CN_liaoning_XiaobeiNeural": "zh-CN-liaoning-XiaobeiNeural",
"tts.voice.en_US_AriaNeural": "en-US-AriaNeural",
"tts.voice.en_US_JennyNeural": "en-US-JennyNeural",
"tts.voice.en_US_GuyNeural": "en-US-GuyNeural",
"tts.voice.en_US_DavisNeural": "en-US-DavisNeural",
"tts.voice.en_GB_SoniaNeural": "en-GB-SoniaNeural",
"tts.voice.en_GB_RyanNeural": "en-GB-RyanNeural",
"tts.selector": "Workflow Selection", "tts.selector": "Workflow Selection",
"tts.what": "Converts narration text to natural human-like speech (some workflows support reference audio for voice cloning)", "tts.what": "Converts narration text to natural human-like speech (some workflows support reference audio for voice cloning)",
"tts.how": "Place tts_xxx.json workflow files in workflows/selfhost/ (local ComfyUI) or workflows/runninghub/ (cloud) folder", "tts.how": "Place tts_xxx.json workflow files in workflows/selfhost/ (local ComfyUI) or workflows/runninghub/ (cloud) folder",

View File

@@ -179,6 +179,31 @@
"settings.comfyui.runninghub_api_key": "RunningHub API 密钥", "settings.comfyui.runninghub_api_key": "RunningHub API 密钥",
"settings.comfyui.runninghub_api_key_help": "访问 https://runninghub.ai 注册并获取 API Key", "settings.comfyui.runninghub_api_key_help": "访问 https://runninghub.ai 注册并获取 API Key",
"tts.inference_mode": "合成方式",
"tts.mode.local": "本地合成",
"tts.mode.comfyui": "ComfyUI 合成",
"tts.mode.local_hint": "💡 使用 Edge TTS,无需配置,开箱即用,请确保网络环境可用",
"tts.mode.comfyui_hint": "⚙️ 使用 ComfyUI 工作流,灵活强大",
"tts.voice_selector": "音色选择",
"tts.speed": "语速",
"tts.speed_label": "{speed}x",
"tts.voice.zh_CN_XiaoxiaoNeural": "女声-温柔(晓晓)",
"tts.voice.zh_CN_XiaoyiNeural": "女声-甜美(晓伊)",
"tts.voice.zh_CN_YunjianNeural": "男声-专业(云健)",
"tts.voice.zh_CN_YunxiNeural": "男声-磁性(云希)",
"tts.voice.zh_CN_YunyangNeural": "男声-新闻(云扬)",
"tts.voice.zh_CN_YunyeNeural": "男声-自然(云野)",
"tts.voice.zh_CN_YunfengNeural": "男声-沉稳(云锋)",
"tts.voice.zh_CN_liaoning_XiaobeiNeural": "女声-东北(小北)",
"tts.voice.en_US_AriaNeural": "女声-自然Aria",
"tts.voice.en_US_JennyNeural": "女声-温暖Jenny",
"tts.voice.en_US_GuyNeural": "男声-标准Guy",
"tts.voice.en_US_DavisNeural": "男声-友好Davis",
"tts.voice.en_GB_SoniaNeural": "女声-英式Sonia",
"tts.voice.en_GB_RyanNeural": "男声-英式Ryan",
"tts.selector": "工作流选择", "tts.selector": "工作流选择",
"tts.what": "将旁白文本转换为真人般的自然语音(部分工作流支持参考音频克隆声音)", "tts.what": "将旁白文本转换为真人般的自然语音(部分工作流支持参考音频克隆声音)",
"tts.how": "将 tts_xxx.json 工作流文件放入 workflows/selfhost/(本地 ComfyUI或 workflows/runninghub/(云端)文件夹", "tts.how": "将 tts_xxx.json 工作流文件放入 workflows/selfhost/(本地 ComfyUI或 workflows/runninghub/(云端)文件夹",