tts支持本地合成
This commit is contained in:
206
web/app.py
206
web/app.py
@@ -449,58 +449,146 @@ def main():
|
||||
st.markdown(f"**{tr('help.how')}**")
|
||||
st.markdown(tr("tts.how"))
|
||||
|
||||
# Get available TTS workflows
|
||||
tts_workflows = pixelle_video.tts.list_workflows()
|
||||
|
||||
# Build options for selectbox
|
||||
tts_workflow_options = [wf["display_name"] for wf in tts_workflows]
|
||||
tts_workflow_keys = [wf["key"] for wf in tts_workflows]
|
||||
|
||||
# Default to saved workflow if exists
|
||||
default_tts_index = 0
|
||||
# Get TTS config
|
||||
comfyui_config = config_manager.get_comfyui_config()
|
||||
saved_tts_workflow = comfyui_config["tts"]["default_workflow"]
|
||||
if saved_tts_workflow and saved_tts_workflow in tts_workflow_keys:
|
||||
default_tts_index = tts_workflow_keys.index(saved_tts_workflow)
|
||||
tts_config = comfyui_config["tts"]
|
||||
|
||||
tts_workflow_display = st.selectbox(
|
||||
"TTS Workflow",
|
||||
tts_workflow_options if tts_workflow_options else ["No TTS workflows found"],
|
||||
index=default_tts_index,
|
||||
label_visibility="collapsed",
|
||||
key="tts_workflow_select"
|
||||
# Inference mode selection
|
||||
tts_mode = st.radio(
|
||||
tr("tts.inference_mode"),
|
||||
["local", "comfyui"],
|
||||
horizontal=True,
|
||||
format_func=lambda x: tr(f"tts.mode.{x}"),
|
||||
index=0 if tts_config.get("inference_mode", "local") == "local" else 1,
|
||||
key="tts_inference_mode"
|
||||
)
|
||||
|
||||
# Get the actual workflow key
|
||||
if tts_workflow_options:
|
||||
tts_selected_index = tts_workflow_options.index(tts_workflow_display)
|
||||
tts_workflow_key = tts_workflow_keys[tts_selected_index]
|
||||
# Show hint based on mode
|
||||
if tts_mode == "local":
|
||||
st.caption(tr("tts.mode.local_hint"))
|
||||
else:
|
||||
tts_workflow_key = "selfhost/tts_edge.json" # fallback
|
||||
st.caption(tr("tts.mode.comfyui_hint"))
|
||||
|
||||
# Reference audio upload (optional, for voice cloning)
|
||||
ref_audio_file = st.file_uploader(
|
||||
tr("tts.ref_audio"),
|
||||
type=["mp3", "wav", "flac", "m4a", "aac", "ogg"],
|
||||
help=tr("tts.ref_audio_help"),
|
||||
key="ref_audio_upload"
|
||||
)
|
||||
|
||||
# Save uploaded ref_audio to temp file if provided
|
||||
ref_audio_path = None
|
||||
if ref_audio_file is not None:
|
||||
# Audio preview player (directly play uploaded file)
|
||||
st.audio(ref_audio_file)
|
||||
# ================================================================
|
||||
# Local Mode UI
|
||||
# ================================================================
|
||||
if tts_mode == "local":
|
||||
# Import voice configuration
|
||||
from pixelle_video.tts_voices import EDGE_TTS_VOICES, get_voice_display_name
|
||||
|
||||
# Save to temp directory
|
||||
import tempfile
|
||||
temp_dir = Path("temp")
|
||||
temp_dir.mkdir(exist_ok=True)
|
||||
ref_audio_path = temp_dir / f"ref_audio_{ref_audio_file.name}"
|
||||
with open(ref_audio_path, "wb") as f:
|
||||
f.write(ref_audio_file.getbuffer())
|
||||
# Get saved voice from config
|
||||
local_config = tts_config.get("local", {})
|
||||
saved_voice = local_config.get("voice", "zh-CN-YunjianNeural")
|
||||
saved_speed = local_config.get("speed", 1.2)
|
||||
|
||||
# Build voice options with i18n
|
||||
voice_options = []
|
||||
voice_ids = []
|
||||
default_voice_index = 0
|
||||
|
||||
for idx, voice_config in enumerate(EDGE_TTS_VOICES):
|
||||
voice_id = voice_config["id"]
|
||||
display_name = get_voice_display_name(voice_id, tr, get_language())
|
||||
voice_options.append(display_name)
|
||||
voice_ids.append(voice_id)
|
||||
|
||||
# Set default index if matches saved voice
|
||||
if voice_id == saved_voice:
|
||||
default_voice_index = idx
|
||||
|
||||
# Two-column layout: Voice | Speed
|
||||
voice_col, speed_col = st.columns([1, 1])
|
||||
|
||||
with voice_col:
|
||||
# Voice selector
|
||||
selected_voice_display = st.selectbox(
|
||||
tr("tts.voice_selector"),
|
||||
voice_options,
|
||||
index=default_voice_index,
|
||||
key="tts_local_voice"
|
||||
)
|
||||
|
||||
# Get actual voice ID
|
||||
selected_voice_index = voice_options.index(selected_voice_display)
|
||||
selected_voice = voice_ids[selected_voice_index]
|
||||
|
||||
with speed_col:
|
||||
# Speed slider
|
||||
tts_speed = st.slider(
|
||||
tr("tts.speed"),
|
||||
min_value=0.5,
|
||||
max_value=2.0,
|
||||
value=saved_speed,
|
||||
step=0.1,
|
||||
format="%.1fx",
|
||||
key="tts_local_speed"
|
||||
)
|
||||
st.caption(tr("tts.speed_label", speed=f"{tts_speed:.1f}"))
|
||||
|
||||
# Variables for video generation
|
||||
tts_workflow_key = None
|
||||
ref_audio_path = None
|
||||
|
||||
# TTS preview expander (simplified, uses default voice and speed)
|
||||
# ================================================================
|
||||
# ComfyUI Mode UI
|
||||
# ================================================================
|
||||
else: # comfyui mode
|
||||
# Get available TTS workflows
|
||||
tts_workflows = pixelle_video.tts.list_workflows()
|
||||
|
||||
# Build options for selectbox
|
||||
tts_workflow_options = [wf["display_name"] for wf in tts_workflows]
|
||||
tts_workflow_keys = [wf["key"] for wf in tts_workflows]
|
||||
|
||||
# Default to saved workflow if exists
|
||||
default_tts_index = 0
|
||||
saved_tts_workflow = tts_config.get("comfyui", {}).get("default_workflow")
|
||||
if saved_tts_workflow and saved_tts_workflow in tts_workflow_keys:
|
||||
default_tts_index = tts_workflow_keys.index(saved_tts_workflow)
|
||||
|
||||
tts_workflow_display = st.selectbox(
|
||||
"TTS Workflow",
|
||||
tts_workflow_options if tts_workflow_options else ["No TTS workflows found"],
|
||||
index=default_tts_index,
|
||||
label_visibility="collapsed",
|
||||
key="tts_workflow_select"
|
||||
)
|
||||
|
||||
# Get the actual workflow key
|
||||
if tts_workflow_options:
|
||||
tts_selected_index = tts_workflow_options.index(tts_workflow_display)
|
||||
tts_workflow_key = tts_workflow_keys[tts_selected_index]
|
||||
else:
|
||||
tts_workflow_key = "selfhost/tts_edge.json" # fallback
|
||||
|
||||
# Reference audio upload (optional, for voice cloning)
|
||||
ref_audio_file = st.file_uploader(
|
||||
tr("tts.ref_audio"),
|
||||
type=["mp3", "wav", "flac", "m4a", "aac", "ogg"],
|
||||
help=tr("tts.ref_audio_help"),
|
||||
key="ref_audio_upload"
|
||||
)
|
||||
|
||||
# Save uploaded ref_audio to temp file if provided
|
||||
ref_audio_path = None
|
||||
if ref_audio_file is not None:
|
||||
# Audio preview player (directly play uploaded file)
|
||||
st.audio(ref_audio_file)
|
||||
|
||||
# Save to temp directory
|
||||
temp_dir = Path("temp")
|
||||
temp_dir.mkdir(exist_ok=True)
|
||||
ref_audio_path = temp_dir / f"ref_audio_{ref_audio_file.name}"
|
||||
with open(ref_audio_path, "wb") as f:
|
||||
f.write(ref_audio_file.getbuffer())
|
||||
|
||||
# Variables for video generation
|
||||
selected_voice = None
|
||||
tts_speed = None
|
||||
|
||||
# ================================================================
|
||||
# TTS Preview (works for both modes)
|
||||
# ================================================================
|
||||
with st.expander(tr("tts.preview_title"), expanded=False):
|
||||
# Preview text input
|
||||
preview_text = st.text_input(
|
||||
@@ -514,14 +602,19 @@ def main():
|
||||
if st.button(tr("tts.preview_button"), key="preview_tts", use_container_width=True):
|
||||
with st.spinner(tr("tts.previewing")):
|
||||
try:
|
||||
# Generate preview audio using selected workflow (use default voice and speed)
|
||||
# Pass ref_audio if uploaded
|
||||
# Build TTS params based on mode
|
||||
tts_params = {
|
||||
"text": preview_text,
|
||||
"workflow": tts_workflow_key
|
||||
"inference_mode": tts_mode
|
||||
}
|
||||
if ref_audio_path:
|
||||
tts_params["ref_audio"] = str(ref_audio_path)
|
||||
|
||||
if tts_mode == "local":
|
||||
tts_params["voice"] = selected_voice
|
||||
tts_params["speed"] = tts_speed
|
||||
else: # comfyui
|
||||
tts_params["workflow"] = tts_workflow_key
|
||||
if ref_audio_path:
|
||||
tts_params["ref_audio"] = str(ref_audio_path)
|
||||
|
||||
audio_path = run_async(pixelle_video.tts(**tts_params))
|
||||
|
||||
@@ -979,7 +1072,6 @@ def main():
|
||||
"mode": mode,
|
||||
"title": title if title else None,
|
||||
"n_scenes": n_scenes,
|
||||
"tts_workflow": tts_workflow_key,
|
||||
"image_workflow": workflow_key,
|
||||
"image_width": int(image_width),
|
||||
"image_height": int(image_height),
|
||||
@@ -989,14 +1081,20 @@ def main():
|
||||
"progress_callback": update_progress,
|
||||
}
|
||||
|
||||
# Add TTS parameters based on mode
|
||||
video_params["tts_inference_mode"] = tts_mode
|
||||
if tts_mode == "local":
|
||||
video_params["tts_voice"] = selected_voice
|
||||
video_params["tts_speed"] = tts_speed
|
||||
else: # comfyui
|
||||
video_params["tts_workflow"] = tts_workflow_key
|
||||
if ref_audio_path:
|
||||
video_params["ref_audio"] = str(ref_audio_path)
|
||||
|
||||
# Add custom template parameters if any
|
||||
if custom_values_for_video:
|
||||
video_params["template_params"] = custom_values_for_video
|
||||
|
||||
# Add ref_audio if uploaded
|
||||
if ref_audio_path:
|
||||
video_params["ref_audio"] = str(ref_audio_path)
|
||||
|
||||
result = run_async(pixelle_video.generate_video(**video_params))
|
||||
|
||||
progress_bar.progress(100)
|
||||
|
||||
@@ -179,6 +179,31 @@
|
||||
"settings.comfyui.runninghub_api_key": "RunningHub API Key",
|
||||
"settings.comfyui.runninghub_api_key_help": "Visit https://runninghub.ai to register and get API Key",
|
||||
|
||||
"tts.inference_mode": "Synthesis Mode",
|
||||
"tts.mode.local": "Local Synthesis",
|
||||
"tts.mode.comfyui": "ComfyUI Synthesis",
|
||||
"tts.mode.local_hint": "💡 Using Edge TTS, no configuration required, ready to use",
|
||||
"tts.mode.comfyui_hint": "⚙️ Using ComfyUI workflows, flexible and powerful",
|
||||
|
||||
"tts.voice_selector": "Voice Selection",
|
||||
"tts.speed": "Speed",
|
||||
"tts.speed_label": "{speed}x",
|
||||
|
||||
"tts.voice.zh_CN_XiaoxiaoNeural": "zh-CN-XiaoxiaoNeural",
|
||||
"tts.voice.zh_CN_XiaoyiNeural": "zh-CN-XiaoyiNeural",
|
||||
"tts.voice.zh_CN_YunjianNeural": "zh-CN-YunjianNeural",
|
||||
"tts.voice.zh_CN_YunxiNeural": "zh-CN-YunxiNeural",
|
||||
"tts.voice.zh_CN_YunyangNeural": "zh-CN-YunyangNeural",
|
||||
"tts.voice.zh_CN_YunyeNeural": "zh-CN-YunyeNeural",
|
||||
"tts.voice.zh_CN_YunfengNeural": "zh-CN-YunfengNeural",
|
||||
"tts.voice.zh_CN_liaoning_XiaobeiNeural": "zh-CN-liaoning-XiaobeiNeural",
|
||||
"tts.voice.en_US_AriaNeural": "en-US-AriaNeural",
|
||||
"tts.voice.en_US_JennyNeural": "en-US-JennyNeural",
|
||||
"tts.voice.en_US_GuyNeural": "en-US-GuyNeural",
|
||||
"tts.voice.en_US_DavisNeural": "en-US-DavisNeural",
|
||||
"tts.voice.en_GB_SoniaNeural": "en-GB-SoniaNeural",
|
||||
"tts.voice.en_GB_RyanNeural": "en-GB-RyanNeural",
|
||||
|
||||
"tts.selector": "Workflow Selection",
|
||||
"tts.what": "Converts narration text to natural human-like speech (some workflows support reference audio for voice cloning)",
|
||||
"tts.how": "Place tts_xxx.json workflow files in workflows/selfhost/ (local ComfyUI) or workflows/runninghub/ (cloud) folder",
|
||||
|
||||
@@ -179,6 +179,31 @@
|
||||
"settings.comfyui.runninghub_api_key": "RunningHub API 密钥",
|
||||
"settings.comfyui.runninghub_api_key_help": "访问 https://runninghub.ai 注册并获取 API Key",
|
||||
|
||||
"tts.inference_mode": "合成方式",
|
||||
"tts.mode.local": "本地合成",
|
||||
"tts.mode.comfyui": "ComfyUI 合成",
|
||||
"tts.mode.local_hint": "💡 使用 Edge TTS,无需配置,开箱即用(请确保网络环境可用)",
|
||||
"tts.mode.comfyui_hint": "⚙️ 使用 ComfyUI 工作流,灵活强大",
|
||||
|
||||
"tts.voice_selector": "音色选择",
|
||||
"tts.speed": "语速",
|
||||
"tts.speed_label": "{speed}x",
|
||||
|
||||
"tts.voice.zh_CN_XiaoxiaoNeural": "女声-温柔(晓晓)",
|
||||
"tts.voice.zh_CN_XiaoyiNeural": "女声-甜美(晓伊)",
|
||||
"tts.voice.zh_CN_YunjianNeural": "男声-专业(云健)",
|
||||
"tts.voice.zh_CN_YunxiNeural": "男声-磁性(云希)",
|
||||
"tts.voice.zh_CN_YunyangNeural": "男声-新闻(云扬)",
|
||||
"tts.voice.zh_CN_YunyeNeural": "男声-自然(云野)",
|
||||
"tts.voice.zh_CN_YunfengNeural": "男声-沉稳(云锋)",
|
||||
"tts.voice.zh_CN_liaoning_XiaobeiNeural": "女声-东北(小北)",
|
||||
"tts.voice.en_US_AriaNeural": "女声-自然(Aria)",
|
||||
"tts.voice.en_US_JennyNeural": "女声-温暖(Jenny)",
|
||||
"tts.voice.en_US_GuyNeural": "男声-标准(Guy)",
|
||||
"tts.voice.en_US_DavisNeural": "男声-友好(Davis)",
|
||||
"tts.voice.en_GB_SoniaNeural": "女声-英式(Sonia)",
|
||||
"tts.voice.en_GB_RyanNeural": "男声-英式(Ryan)",
|
||||
|
||||
"tts.selector": "工作流选择",
|
||||
"tts.what": "将旁白文本转换为真人般的自然语音(部分工作流支持参考音频克隆声音)",
|
||||
"tts.how": "将 tts_xxx.json 工作流文件放入 workflows/selfhost/(本地 ComfyUI)或 workflows/runninghub/(云端)文件夹",
|
||||
|
||||
Reference in New Issue
Block a user