tts支持本地合成
This commit is contained in:
@@ -14,9 +14,28 @@ class LLMConfig(BaseModel):
|
|||||||
model: str = Field(default="", description="LLM Model Name")
|
model: str = Field(default="", description="LLM Model Name")
|
||||||
|
|
||||||
|
|
||||||
|
class TTSLocalConfig(BaseModel):
    """Local TTS configuration (Edge TTS)"""

    # Edge TTS voice identifier used when no voice is chosen explicitly.
    voice: str = Field(
        default="zh-CN-YunjianNeural",
        description="Edge TTS voice ID",
    )

    # Speech speed multiplier; the ge/le bounds restrict it to [0.5, 2.0].
    speed: float = Field(
        default=1.2,
        ge=0.5,
        le=2.0,
        description="Speech speed multiplier (0.5-2.0)",
    )
|
||||||
|
|
||||||
|
|
||||||
|
class TTSComfyUIConfig(BaseModel):
    """ComfyUI TTS configuration"""

    # Workflow used for ComfyUI synthesis when the caller names none;
    # None means no default workflow is configured.
    default_workflow: Optional[str] = Field(
        default=None,
        description="Default TTS workflow (optional)",
    )
|
||||||
|
|
||||||
|
|
||||||
class TTSSubConfig(BaseModel):
|
class TTSSubConfig(BaseModel):
|
||||||
"""TTS-specific configuration (under comfyui.tts)"""
|
"""TTS-specific configuration (under comfyui.tts)"""
|
||||||
default_workflow: Optional[str] = Field(default=None, description="Default TTS workflow (optional)")
|
inference_mode: str = Field(default="local", description="TTS inference mode: 'local' or 'comfyui'")
|
||||||
|
local: TTSLocalConfig = Field(default_factory=TTSLocalConfig, description="Local TTS (Edge TTS) configuration")
|
||||||
|
comfyui: TTSComfyUIConfig = Field(default_factory=TTSComfyUIConfig, description="ComfyUI TTS configuration")
|
||||||
|
|
||||||
|
# Backward compatibility: keep default_workflow at top level
|
||||||
|
@property
|
||||||
|
def default_workflow(self) -> Optional[str]:
|
||||||
|
"""Get default workflow (for backward compatibility)"""
|
||||||
|
return self.comfyui.default_workflow
|
||||||
|
|
||||||
|
|
||||||
class ImageSubConfig(BaseModel):
|
class ImageSubConfig(BaseModel):
|
||||||
|
|||||||
@@ -24,10 +24,11 @@ class StoryboardConfig:
|
|||||||
video_fps: int = 30 # Frame rate
|
video_fps: int = 30 # Frame rate
|
||||||
|
|
||||||
# Audio parameters
|
# Audio parameters
|
||||||
voice_id: str = "[Chinese] zh-CN Yunjian" # Default voice
|
tts_inference_mode: str = "local" # TTS inference mode: "local" or "comfyui"
|
||||||
tts_workflow: Optional[str] = None # TTS workflow filename (None = use default)
|
voice_id: Optional[str] = None # Voice ID (for local: Edge TTS voice ID; for comfyui: workflow-specific)
|
||||||
tts_speed: float = 1.2 # TTS speed multiplier (1.0 = normal, >1.0 = faster)
|
tts_workflow: Optional[str] = None # TTS workflow filename (for ComfyUI mode, None = use default)
|
||||||
ref_audio: Optional[str] = None # Reference audio for voice cloning (only some workflows support this)
|
tts_speed: Optional[float] = None # TTS speed multiplier (0.5-2.0, 1.0 = normal)
|
||||||
|
ref_audio: Optional[str] = None # Reference audio for voice cloning (ComfyUI mode only)
|
||||||
|
|
||||||
# Image parameters
|
# Image parameters
|
||||||
image_width: int = 1024
|
image_width: int = 1024
|
||||||
|
|||||||
@@ -62,10 +62,17 @@ class StandardPipeline(BasePipeline):
|
|||||||
|
|
||||||
# === Basic Config ===
|
# === Basic Config ===
|
||||||
n_scenes: int = 5, # Only used in generate mode; ignored in fixed mode
|
n_scenes: int = 5, # Only used in generate mode; ignored in fixed mode
|
||||||
voice_id: str = "[Chinese] zh-CN Yunjian",
|
|
||||||
tts_workflow: Optional[str] = None,
|
# === TTS Parameters ===
|
||||||
tts_speed: float = 1.2,
|
tts_inference_mode: Optional[str] = None, # "local" or "comfyui"
|
||||||
ref_audio: Optional[str] = None, # Reference audio for voice cloning
|
tts_voice: Optional[str] = None, # For local mode: Edge TTS voice ID
|
||||||
|
tts_speed: Optional[float] = None, # Speed multiplier (0.5-2.0)
|
||||||
|
tts_workflow: Optional[str] = None, # For ComfyUI mode: workflow path
|
||||||
|
ref_audio: Optional[str] = None, # For ComfyUI mode: reference audio
|
||||||
|
|
||||||
|
# Deprecated (kept for backward compatibility)
|
||||||
|
voice_id: Optional[str] = None,
|
||||||
|
|
||||||
output_path: Optional[str] = None,
|
output_path: Optional[str] = None,
|
||||||
|
|
||||||
# === LLM Parameters ===
|
# === LLM Parameters ===
|
||||||
@@ -191,6 +198,29 @@ class StandardPipeline(BasePipeline):
|
|||||||
output_path = get_task_final_video_path(task_id)
|
output_path = get_task_final_video_path(task_id)
|
||||||
logger.info(f" Will copy final video to: {user_specified_output}")
|
logger.info(f" Will copy final video to: {user_specified_output}")
|
||||||
|
|
||||||
|
# Determine TTS inference mode and parameters
|
||||||
|
# Priority: explicit params > backward compatibility > config defaults
|
||||||
|
if tts_inference_mode is None:
|
||||||
|
# Check if user provided ComfyUI-specific params
|
||||||
|
if tts_workflow is not None or ref_audio is not None:
|
||||||
|
tts_inference_mode = "comfyui"
|
||||||
|
# Check if user provided old voice_id param (backward compatibility)
|
||||||
|
elif voice_id is not None:
|
||||||
|
tts_inference_mode = "comfyui"
|
||||||
|
if tts_voice is None:
|
||||||
|
tts_voice = voice_id
|
||||||
|
else:
|
||||||
|
# Use config default
|
||||||
|
tts_config = self.core.config.get("comfyui", {}).get("tts", {})
|
||||||
|
tts_inference_mode = tts_config.get("inference_mode", "local")
|
||||||
|
|
||||||
|
# Set voice_id based on mode for StoryboardConfig
|
||||||
|
final_voice_id = None
|
||||||
|
if tts_inference_mode == "local":
|
||||||
|
final_voice_id = tts_voice or voice_id
|
||||||
|
else: # comfyui
|
||||||
|
final_voice_id = voice_id # For ComfyUI, might be None
|
||||||
|
|
||||||
# Create storyboard config
|
# Create storyboard config
|
||||||
config = StoryboardConfig(
|
config = StoryboardConfig(
|
||||||
task_id=task_id,
|
task_id=task_id,
|
||||||
@@ -200,7 +230,8 @@ class StandardPipeline(BasePipeline):
|
|||||||
min_image_prompt_words=min_image_prompt_words,
|
min_image_prompt_words=min_image_prompt_words,
|
||||||
max_image_prompt_words=max_image_prompt_words,
|
max_image_prompt_words=max_image_prompt_words,
|
||||||
video_fps=video_fps,
|
video_fps=video_fps,
|
||||||
voice_id=voice_id,
|
tts_inference_mode=tts_inference_mode,
|
||||||
|
voice_id=final_voice_id,
|
||||||
tts_workflow=tts_workflow,
|
tts_workflow=tts_workflow,
|
||||||
tts_speed=tts_speed,
|
tts_speed=tts_speed,
|
||||||
ref_audio=ref_audio,
|
ref_audio=ref_audio,
|
||||||
|
|||||||
@@ -124,18 +124,29 @@ class FrameProcessor:
|
|||||||
from pixelle_video.utils.os_util import get_task_frame_path
|
from pixelle_video.utils.os_util import get_task_frame_path
|
||||||
output_path = get_task_frame_path(config.task_id, frame.index, "audio")
|
output_path = get_task_frame_path(config.task_id, frame.index, "audio")
|
||||||
|
|
||||||
# Call TTS with specific output path and workflow
|
# Build TTS params based on inference mode
|
||||||
tts_params = {
|
tts_params = {
|
||||||
"text": frame.narration,
|
"text": frame.narration,
|
||||||
"workflow": config.tts_workflow,
|
"inference_mode": config.tts_inference_mode,
|
||||||
"voice": config.voice_id,
|
|
||||||
"speed": config.tts_speed,
|
|
||||||
"output_path": output_path,
|
"output_path": output_path,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Add ref_audio if provided
|
if config.tts_inference_mode == "local":
|
||||||
if config.ref_audio:
|
# Local mode: pass voice and speed
|
||||||
tts_params["ref_audio"] = config.ref_audio
|
if config.voice_id:
|
||||||
|
tts_params["voice"] = config.voice_id
|
||||||
|
if config.tts_speed is not None:
|
||||||
|
tts_params["speed"] = config.tts_speed
|
||||||
|
else: # comfyui
|
||||||
|
# ComfyUI mode: pass workflow, voice, speed, and ref_audio
|
||||||
|
if config.tts_workflow:
|
||||||
|
tts_params["workflow"] = config.tts_workflow
|
||||||
|
if config.voice_id:
|
||||||
|
tts_params["voice"] = config.voice_id
|
||||||
|
if config.tts_speed is not None:
|
||||||
|
tts_params["speed"] = config.tts_speed
|
||||||
|
if config.ref_audio:
|
||||||
|
tts_params["ref_audio"] = config.ref_audio
|
||||||
|
|
||||||
audio_path = await self.core.tts(**tts_params)
|
audio_path = await self.core.tts(**tts_params)
|
||||||
|
|
||||||
|
|||||||
@@ -1,13 +1,18 @@
|
|||||||
"""
|
"""
|
||||||
TTS (Text-to-Speech) Service - ComfyUI Workflow-based implementation
|
TTS (Text-to-Speech) Service - Supports both local and ComfyUI inference
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import uuid
|
||||||
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from comfykit import ComfyKit
|
from comfykit import ComfyKit
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from pixelle_video.services.comfy_base_service import ComfyBaseService
|
from pixelle_video.services.comfy_base_service import ComfyBaseService
|
||||||
|
from pixelle_video.utils.tts_util import edge_tts
|
||||||
|
from pixelle_video.tts_voices import speed_to_rate
|
||||||
|
|
||||||
|
|
||||||
class TTSService(ComfyBaseService):
|
class TTSService(ComfyBaseService):
|
||||||
@@ -52,22 +57,25 @@ class TTSService(ComfyBaseService):
|
|||||||
comfyui_url: Optional[str] = None,
|
comfyui_url: Optional[str] = None,
|
||||||
runninghub_api_key: Optional[str] = None,
|
runninghub_api_key: Optional[str] = None,
|
||||||
# TTS parameters
|
# TTS parameters
|
||||||
voice: str = "[Chinese] zh-CN Yunjian",
|
voice: Optional[str] = None,
|
||||||
speed: float = 1.2,
|
speed: Optional[float] = None,
|
||||||
|
# Inference mode override
|
||||||
|
inference_mode: Optional[str] = None,
|
||||||
# Output path
|
# Output path
|
||||||
output_path: Optional[str] = None,
|
output_path: Optional[str] = None,
|
||||||
**params
|
**params
|
||||||
) -> str:
|
) -> str:
|
||||||
"""
|
"""
|
||||||
Generate speech using ComfyUI workflow
|
Generate speech using local Edge TTS or ComfyUI workflow
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text: Text to convert to speech
|
text: Text to convert to speech
|
||||||
workflow: Workflow filename (default: from config)
|
workflow: Workflow filename (for ComfyUI mode, default: from config)
|
||||||
comfyui_url: ComfyUI URL (optional, overrides config)
|
comfyui_url: ComfyUI URL (optional, overrides config)
|
||||||
runninghub_api_key: RunningHub API key (optional, overrides config)
|
runninghub_api_key: RunningHub API key (optional, overrides config)
|
||||||
voice: Voice ID (workflow-specific)
|
voice: Voice ID (for local mode: Edge TTS voice ID; for ComfyUI: workflow-specific)
|
||||||
speed: Speech speed multiplier (1.0 = normal, >1.0 = faster, <1.0 = slower)
|
speed: Speech speed multiplier (1.0 = normal, >1.0 = faster, <1.0 = slower)
|
||||||
|
inference_mode: Override inference mode ("local" or "comfyui", default: from config)
|
||||||
output_path: Custom output path (auto-generated if None)
|
output_path: Custom output path (auto-generated if None)
|
||||||
**params: Additional workflow parameters
|
**params: Additional workflow parameters
|
||||||
|
|
||||||
@@ -75,49 +83,103 @@ class TTSService(ComfyBaseService):
|
|||||||
Generated audio file path
|
Generated audio file path
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
# Simplest: use default workflow
|
# Local inference (Edge TTS)
|
||||||
audio_path = await pixelle_video.tts(text="Hello, world!")
|
|
||||||
|
|
||||||
# Use specific workflow
|
|
||||||
audio_path = await pixelle_video.tts(
|
audio_path = await pixelle_video.tts(
|
||||||
text="你好,世界!",
|
text="Hello, world!",
|
||||||
workflow="tts_edge.json"
|
inference_mode="local",
|
||||||
)
|
voice="zh-CN-YunjianNeural",
|
||||||
|
|
||||||
# With voice and speed
|
|
||||||
audio_path = await pixelle_video.tts(
|
|
||||||
text="Hello",
|
|
||||||
workflow="tts_edge.json",
|
|
||||||
voice="[Chinese] zh-CN Xiaoxiao",
|
|
||||||
speed=1.2
|
speed=1.2
|
||||||
)
|
)
|
||||||
|
|
||||||
# With absolute path
|
# ComfyUI inference
|
||||||
audio_path = await pixelle_video.tts(
|
audio_path = await pixelle_video.tts(
|
||||||
text="Hello",
|
text="你好,世界!",
|
||||||
workflow="/path/to/custom_tts.json"
|
inference_mode="comfyui",
|
||||||
)
|
workflow="runninghub/tts_edge.json"
|
||||||
|
|
||||||
# With custom ComfyUI server
|
|
||||||
audio_path = await pixelle_video.tts(
|
|
||||||
text="Hello",
|
|
||||||
comfyui_url="http://192.168.1.100:8188"
|
|
||||||
)
|
)
|
||||||
"""
|
"""
|
||||||
# 1. Resolve workflow (returns structured info)
|
# Determine inference mode (param > config)
|
||||||
workflow_info = self._resolve_workflow(workflow=workflow)
|
mode = inference_mode or self.config.get("inference_mode", "local")
|
||||||
|
|
||||||
# 2. Execute ComfyUI workflow
|
# Route to appropriate implementation
|
||||||
return await self._call_comfyui_workflow(
|
if mode == "local":
|
||||||
workflow_info=workflow_info,
|
return await self._call_local_tts(
|
||||||
text=text,
|
text=text,
|
||||||
comfyui_url=comfyui_url,
|
voice=voice,
|
||||||
runninghub_api_key=runninghub_api_key,
|
speed=speed,
|
||||||
voice=voice,
|
output_path=output_path
|
||||||
speed=speed,
|
)
|
||||||
output_path=output_path,
|
else: # comfyui
|
||||||
**params
|
# 1. Resolve workflow (returns structured info)
|
||||||
)
|
workflow_info = self._resolve_workflow(workflow=workflow)
|
||||||
|
|
||||||
|
# 2. Execute ComfyUI workflow
|
||||||
|
return await self._call_comfyui_workflow(
|
||||||
|
workflow_info=workflow_info,
|
||||||
|
text=text,
|
||||||
|
comfyui_url=comfyui_url,
|
||||||
|
runninghub_api_key=runninghub_api_key,
|
||||||
|
voice=voice,
|
||||||
|
speed=speed,
|
||||||
|
output_path=output_path,
|
||||||
|
**params
|
||||||
|
)
|
||||||
|
|
||||||
|
async def _call_local_tts(
    self,
    text: str,
    voice: Optional[str] = None,
    speed: Optional[float] = None,
    output_path: Optional[str] = None,
) -> str:
    """
    Generate speech using local Edge TTS

    Args:
        text: Text to convert to speech
        voice: Edge TTS voice ID (default: ``local.voice`` from config,
            falling back to "zh-CN-YunjianNeural")
        speed: Speech speed multiplier (default: ``local.speed`` from config,
            falling back to 1.2)
        output_path: Custom output path (auto-generated under ``output/``
            if None or empty)

    Returns:
        Generated audio file path (the resolved ``output_path``)

    Raises:
        Exception: Any error raised by the Edge TTS helper is logged and
            re-raised unchanged.
    """
    local_config = self.config.get("local", {})

    # Explicit arguments win over config values. ``speed`` is compared against
    # None on purpose so that it is only replaced when truly omitted.
    final_voice = voice or local_config.get("voice", "zh-CN-YunjianNeural")
    final_speed = speed if speed is not None else local_config.get("speed", 1.2)

    # Edge TTS takes a signed percentage string rather than a multiplier.
    rate = speed_to_rate(final_speed)

    logger.info(f"🎙️ Using local Edge TTS: voice={final_voice}, speed={final_speed}x (rate={rate})")

    if not output_path:
        # No path requested: use a unique file name under ./output.
        output_path = f"output/{uuid.uuid4().hex}.mp3"

    # Make sure the destination directory exists for caller-supplied paths
    # too, not only for the auto-generated one.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    try:
        # The helper's return value is not used; callers receive the path.
        # NOTE(review): presumably edge_tts writes the audio to output_path -
        # confirm against pixelle_video.utils.tts_util.
        await edge_tts(
            text=text,
            voice=final_voice,
            rate=rate,
            output_path=output_path,
        )
    except Exception as e:
        logger.error(f"Local TTS generation error: {e}")
        raise

    logger.info(f"✅ Generated audio (local Edge TTS): {output_path}")
    return output_path
|
||||||
|
|
||||||
async def _call_comfyui_workflow(
|
async def _call_comfyui_workflow(
|
||||||
self,
|
self,
|
||||||
|
|||||||
147
pixelle_video/tts_voices.py
Normal file
147
pixelle_video/tts_voices.py
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
"""
|
||||||
|
TTS Voice Configuration
|
||||||
|
|
||||||
|
Defines available voices for local Edge TTS inference.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import List, Dict, Any
|
||||||
|
|
||||||
|
|
||||||
|
# Edge TTS voice presets for local inference
|
||||||
|
EDGE_TTS_VOICES: List[Dict[str, Any]] = [
|
||||||
|
# Chinese voices
|
||||||
|
{
|
||||||
|
"id": "zh-CN-XiaoxiaoNeural",
|
||||||
|
"label_key": "tts.voice.zh_CN_XiaoxiaoNeural",
|
||||||
|
"locale": "zh-CN",
|
||||||
|
"gender": "female"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "zh-CN-XiaoyiNeural",
|
||||||
|
"label_key": "tts.voice.zh_CN_XiaoyiNeural",
|
||||||
|
"locale": "zh-CN",
|
||||||
|
"gender": "female"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "zh-CN-YunjianNeural",
|
||||||
|
"label_key": "tts.voice.zh_CN_YunjianNeural",
|
||||||
|
"locale": "zh-CN",
|
||||||
|
"gender": "male"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "zh-CN-YunxiNeural",
|
||||||
|
"label_key": "tts.voice.zh_CN_YunxiNeural",
|
||||||
|
"locale": "zh-CN",
|
||||||
|
"gender": "male"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "zh-CN-YunyangNeural",
|
||||||
|
"label_key": "tts.voice.zh_CN_YunyangNeural",
|
||||||
|
"locale": "zh-CN",
|
||||||
|
"gender": "male"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "zh-CN-YunyeNeural",
|
||||||
|
"label_key": "tts.voice.zh_CN_YunyeNeural",
|
||||||
|
"locale": "zh-CN",
|
||||||
|
"gender": "male"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "zh-CN-YunfengNeural",
|
||||||
|
"label_key": "tts.voice.zh_CN_YunfengNeural",
|
||||||
|
"locale": "zh-CN",
|
||||||
|
"gender": "male"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "zh-CN-liaoning-XiaobeiNeural",
|
||||||
|
"label_key": "tts.voice.zh_CN_liaoning_XiaobeiNeural",
|
||||||
|
"locale": "zh-CN",
|
||||||
|
"gender": "female"
|
||||||
|
},
|
||||||
|
|
||||||
|
# English voices
|
||||||
|
{
|
||||||
|
"id": "en-US-AriaNeural",
|
||||||
|
"label_key": "tts.voice.en_US_AriaNeural",
|
||||||
|
"locale": "en-US",
|
||||||
|
"gender": "female"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "en-US-JennyNeural",
|
||||||
|
"label_key": "tts.voice.en_US_JennyNeural",
|
||||||
|
"locale": "en-US",
|
||||||
|
"gender": "female"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "en-US-GuyNeural",
|
||||||
|
"label_key": "tts.voice.en_US_GuyNeural",
|
||||||
|
"locale": "en-US",
|
||||||
|
"gender": "male"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "en-US-DavisNeural",
|
||||||
|
"label_key": "tts.voice.en_US_DavisNeural",
|
||||||
|
"locale": "en-US",
|
||||||
|
"gender": "male"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "en-GB-SoniaNeural",
|
||||||
|
"label_key": "tts.voice.en_GB_SoniaNeural",
|
||||||
|
"locale": "en-GB",
|
||||||
|
"gender": "female"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "en-GB-RyanNeural",
|
||||||
|
"label_key": "tts.voice.en_GB_RyanNeural",
|
||||||
|
"locale": "en-GB",
|
||||||
|
"gender": "male"
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def get_voice_display_name(voice_id: str, tr_func=None, locale: str = "zh_CN") -> str:
    """
    Resolve the label shown to the user for a voice.

    Args:
        voice_id: Voice ID (e.g., "zh-CN-YunjianNeural")
        tr_func: Optional callable that maps a translation key to text
        locale: Current locale (default: "zh_CN")

    Returns:
        ``tr_func(label_key)`` when the voice is listed in EDGE_TTS_VOICES,
        the locale is "zh_CN" and a translation function was supplied;
        otherwise the voice ID itself.
    """
    # First preset whose id matches, or None when the voice is not listed.
    preset = None
    for candidate in EDGE_TTS_VOICES:
        if candidate["id"] == voice_id:
            preset = candidate
            break

    # Translation applies only to known voices, in the Chinese locale, when a
    # translation function is available.
    if preset and locale == "zh_CN" and tr_func:
        return tr_func(preset["label_key"])

    # Unknown voice, other locale, or no translator: show the raw ID.
    return voice_id
|
||||||
|
|
||||||
|
|
||||||
|
def speed_to_rate(speed: float) -> str:
    """
    Convert speed multiplier to Edge TTS rate parameter

    Args:
        speed: Speed multiplier (1.0 = normal, 1.2 = 120%)

    Returns:
        Rate string with an explicit sign (e.g., "+20%", "-10%")

    Examples:
        1.0 → "+0%"
        1.2 → "+20%"
        0.8 → "-20%"
    """
    # Round to the nearest whole percent instead of truncating: binary floats
    # make (1.2 - 1.0) * 100 evaluate to 19.999999999999996, which int()
    # would turn into 19 and report as "+19%".
    percentage = round((speed - 1.0) * 100)

    # The "+d" format always emits a sign, so zero renders as "+0".
    return f"{percentage:+d}%"
|
||||||
|
|
||||||
206
web/app.py
206
web/app.py
@@ -449,58 +449,146 @@ def main():
|
|||||||
st.markdown(f"**{tr('help.how')}**")
|
st.markdown(f"**{tr('help.how')}**")
|
||||||
st.markdown(tr("tts.how"))
|
st.markdown(tr("tts.how"))
|
||||||
|
|
||||||
# Get available TTS workflows
|
# Get TTS config
|
||||||
tts_workflows = pixelle_video.tts.list_workflows()
|
|
||||||
|
|
||||||
# Build options for selectbox
|
|
||||||
tts_workflow_options = [wf["display_name"] for wf in tts_workflows]
|
|
||||||
tts_workflow_keys = [wf["key"] for wf in tts_workflows]
|
|
||||||
|
|
||||||
# Default to saved workflow if exists
|
|
||||||
default_tts_index = 0
|
|
||||||
comfyui_config = config_manager.get_comfyui_config()
|
comfyui_config = config_manager.get_comfyui_config()
|
||||||
saved_tts_workflow = comfyui_config["tts"]["default_workflow"]
|
tts_config = comfyui_config["tts"]
|
||||||
if saved_tts_workflow and saved_tts_workflow in tts_workflow_keys:
|
|
||||||
default_tts_index = tts_workflow_keys.index(saved_tts_workflow)
|
|
||||||
|
|
||||||
tts_workflow_display = st.selectbox(
|
# Inference mode selection
|
||||||
"TTS Workflow",
|
tts_mode = st.radio(
|
||||||
tts_workflow_options if tts_workflow_options else ["No TTS workflows found"],
|
tr("tts.inference_mode"),
|
||||||
index=default_tts_index,
|
["local", "comfyui"],
|
||||||
label_visibility="collapsed",
|
horizontal=True,
|
||||||
key="tts_workflow_select"
|
format_func=lambda x: tr(f"tts.mode.{x}"),
|
||||||
|
index=0 if tts_config.get("inference_mode", "local") == "local" else 1,
|
||||||
|
key="tts_inference_mode"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Get the actual workflow key
|
# Show hint based on mode
|
||||||
if tts_workflow_options:
|
if tts_mode == "local":
|
||||||
tts_selected_index = tts_workflow_options.index(tts_workflow_display)
|
st.caption(tr("tts.mode.local_hint"))
|
||||||
tts_workflow_key = tts_workflow_keys[tts_selected_index]
|
|
||||||
else:
|
else:
|
||||||
tts_workflow_key = "selfhost/tts_edge.json" # fallback
|
st.caption(tr("tts.mode.comfyui_hint"))
|
||||||
|
|
||||||
# Reference audio upload (optional, for voice cloning)
|
# ================================================================
|
||||||
ref_audio_file = st.file_uploader(
|
# Local Mode UI
|
||||||
tr("tts.ref_audio"),
|
# ================================================================
|
||||||
type=["mp3", "wav", "flac", "m4a", "aac", "ogg"],
|
if tts_mode == "local":
|
||||||
help=tr("tts.ref_audio_help"),
|
# Import voice configuration
|
||||||
key="ref_audio_upload"
|
from pixelle_video.tts_voices import EDGE_TTS_VOICES, get_voice_display_name
|
||||||
)
|
|
||||||
|
|
||||||
# Save uploaded ref_audio to temp file if provided
|
|
||||||
ref_audio_path = None
|
|
||||||
if ref_audio_file is not None:
|
|
||||||
# Audio preview player (directly play uploaded file)
|
|
||||||
st.audio(ref_audio_file)
|
|
||||||
|
|
||||||
# Save to temp directory
|
# Get saved voice from config
|
||||||
import tempfile
|
local_config = tts_config.get("local", {})
|
||||||
temp_dir = Path("temp")
|
saved_voice = local_config.get("voice", "zh-CN-YunjianNeural")
|
||||||
temp_dir.mkdir(exist_ok=True)
|
saved_speed = local_config.get("speed", 1.2)
|
||||||
ref_audio_path = temp_dir / f"ref_audio_{ref_audio_file.name}"
|
|
||||||
with open(ref_audio_path, "wb") as f:
|
# Build voice options with i18n
|
||||||
f.write(ref_audio_file.getbuffer())
|
voice_options = []
|
||||||
|
voice_ids = []
|
||||||
|
default_voice_index = 0
|
||||||
|
|
||||||
|
for idx, voice_config in enumerate(EDGE_TTS_VOICES):
|
||||||
|
voice_id = voice_config["id"]
|
||||||
|
display_name = get_voice_display_name(voice_id, tr, get_language())
|
||||||
|
voice_options.append(display_name)
|
||||||
|
voice_ids.append(voice_id)
|
||||||
|
|
||||||
|
# Set default index if matches saved voice
|
||||||
|
if voice_id == saved_voice:
|
||||||
|
default_voice_index = idx
|
||||||
|
|
||||||
|
# Two-column layout: Voice | Speed
|
||||||
|
voice_col, speed_col = st.columns([1, 1])
|
||||||
|
|
||||||
|
with voice_col:
|
||||||
|
# Voice selector
|
||||||
|
selected_voice_display = st.selectbox(
|
||||||
|
tr("tts.voice_selector"),
|
||||||
|
voice_options,
|
||||||
|
index=default_voice_index,
|
||||||
|
key="tts_local_voice"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get actual voice ID
|
||||||
|
selected_voice_index = voice_options.index(selected_voice_display)
|
||||||
|
selected_voice = voice_ids[selected_voice_index]
|
||||||
|
|
||||||
|
with speed_col:
|
||||||
|
# Speed slider
|
||||||
|
tts_speed = st.slider(
|
||||||
|
tr("tts.speed"),
|
||||||
|
min_value=0.5,
|
||||||
|
max_value=2.0,
|
||||||
|
value=saved_speed,
|
||||||
|
step=0.1,
|
||||||
|
format="%.1fx",
|
||||||
|
key="tts_local_speed"
|
||||||
|
)
|
||||||
|
st.caption(tr("tts.speed_label", speed=f"{tts_speed:.1f}"))
|
||||||
|
|
||||||
|
# Variables for video generation
|
||||||
|
tts_workflow_key = None
|
||||||
|
ref_audio_path = None
|
||||||
|
|
||||||
# TTS preview expander (simplified, uses default voice and speed)
|
# ================================================================
|
||||||
|
# ComfyUI Mode UI
|
||||||
|
# ================================================================
|
||||||
|
else: # comfyui mode
|
||||||
|
# Get available TTS workflows
|
||||||
|
tts_workflows = pixelle_video.tts.list_workflows()
|
||||||
|
|
||||||
|
# Build options for selectbox
|
||||||
|
tts_workflow_options = [wf["display_name"] for wf in tts_workflows]
|
||||||
|
tts_workflow_keys = [wf["key"] for wf in tts_workflows]
|
||||||
|
|
||||||
|
# Default to saved workflow if exists
|
||||||
|
default_tts_index = 0
|
||||||
|
saved_tts_workflow = tts_config.get("comfyui", {}).get("default_workflow")
|
||||||
|
if saved_tts_workflow and saved_tts_workflow in tts_workflow_keys:
|
||||||
|
default_tts_index = tts_workflow_keys.index(saved_tts_workflow)
|
||||||
|
|
||||||
|
tts_workflow_display = st.selectbox(
|
||||||
|
"TTS Workflow",
|
||||||
|
tts_workflow_options if tts_workflow_options else ["No TTS workflows found"],
|
||||||
|
index=default_tts_index,
|
||||||
|
label_visibility="collapsed",
|
||||||
|
key="tts_workflow_select"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get the actual workflow key
|
||||||
|
if tts_workflow_options:
|
||||||
|
tts_selected_index = tts_workflow_options.index(tts_workflow_display)
|
||||||
|
tts_workflow_key = tts_workflow_keys[tts_selected_index]
|
||||||
|
else:
|
||||||
|
tts_workflow_key = "selfhost/tts_edge.json" # fallback
|
||||||
|
|
||||||
|
# Reference audio upload (optional, for voice cloning)
|
||||||
|
ref_audio_file = st.file_uploader(
|
||||||
|
tr("tts.ref_audio"),
|
||||||
|
type=["mp3", "wav", "flac", "m4a", "aac", "ogg"],
|
||||||
|
help=tr("tts.ref_audio_help"),
|
||||||
|
key="ref_audio_upload"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Save uploaded ref_audio to temp file if provided
|
||||||
|
ref_audio_path = None
|
||||||
|
if ref_audio_file is not None:
|
||||||
|
# Audio preview player (directly play uploaded file)
|
||||||
|
st.audio(ref_audio_file)
|
||||||
|
|
||||||
|
# Save to temp directory
|
||||||
|
temp_dir = Path("temp")
|
||||||
|
temp_dir.mkdir(exist_ok=True)
|
||||||
|
ref_audio_path = temp_dir / f"ref_audio_{ref_audio_file.name}"
|
||||||
|
with open(ref_audio_path, "wb") as f:
|
||||||
|
f.write(ref_audio_file.getbuffer())
|
||||||
|
|
||||||
|
# Variables for video generation
|
||||||
|
selected_voice = None
|
||||||
|
tts_speed = None
|
||||||
|
|
||||||
|
# ================================================================
|
||||||
|
# TTS Preview (works for both modes)
|
||||||
|
# ================================================================
|
||||||
with st.expander(tr("tts.preview_title"), expanded=False):
|
with st.expander(tr("tts.preview_title"), expanded=False):
|
||||||
# Preview text input
|
# Preview text input
|
||||||
preview_text = st.text_input(
|
preview_text = st.text_input(
|
||||||
@@ -514,14 +602,19 @@ def main():
|
|||||||
if st.button(tr("tts.preview_button"), key="preview_tts", use_container_width=True):
|
if st.button(tr("tts.preview_button"), key="preview_tts", use_container_width=True):
|
||||||
with st.spinner(tr("tts.previewing")):
|
with st.spinner(tr("tts.previewing")):
|
||||||
try:
|
try:
|
||||||
# Generate preview audio using selected workflow (use default voice and speed)
|
# Build TTS params based on mode
|
||||||
# Pass ref_audio if uploaded
|
|
||||||
tts_params = {
|
tts_params = {
|
||||||
"text": preview_text,
|
"text": preview_text,
|
||||||
"workflow": tts_workflow_key
|
"inference_mode": tts_mode
|
||||||
}
|
}
|
||||||
if ref_audio_path:
|
|
||||||
tts_params["ref_audio"] = str(ref_audio_path)
|
if tts_mode == "local":
|
||||||
|
tts_params["voice"] = selected_voice
|
||||||
|
tts_params["speed"] = tts_speed
|
||||||
|
else: # comfyui
|
||||||
|
tts_params["workflow"] = tts_workflow_key
|
||||||
|
if ref_audio_path:
|
||||||
|
tts_params["ref_audio"] = str(ref_audio_path)
|
||||||
|
|
||||||
audio_path = run_async(pixelle_video.tts(**tts_params))
|
audio_path = run_async(pixelle_video.tts(**tts_params))
|
||||||
|
|
||||||
@@ -979,7 +1072,6 @@ def main():
|
|||||||
"mode": mode,
|
"mode": mode,
|
||||||
"title": title if title else None,
|
"title": title if title else None,
|
||||||
"n_scenes": n_scenes,
|
"n_scenes": n_scenes,
|
||||||
"tts_workflow": tts_workflow_key,
|
|
||||||
"image_workflow": workflow_key,
|
"image_workflow": workflow_key,
|
||||||
"image_width": int(image_width),
|
"image_width": int(image_width),
|
||||||
"image_height": int(image_height),
|
"image_height": int(image_height),
|
||||||
@@ -989,14 +1081,20 @@ def main():
|
|||||||
"progress_callback": update_progress,
|
"progress_callback": update_progress,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Add TTS parameters based on mode
|
||||||
|
video_params["tts_inference_mode"] = tts_mode
|
||||||
|
if tts_mode == "local":
|
||||||
|
video_params["tts_voice"] = selected_voice
|
||||||
|
video_params["tts_speed"] = tts_speed
|
||||||
|
else: # comfyui
|
||||||
|
video_params["tts_workflow"] = tts_workflow_key
|
||||||
|
if ref_audio_path:
|
||||||
|
video_params["ref_audio"] = str(ref_audio_path)
|
||||||
|
|
||||||
# Add custom template parameters if any
|
# Add custom template parameters if any
|
||||||
if custom_values_for_video:
|
if custom_values_for_video:
|
||||||
video_params["template_params"] = custom_values_for_video
|
video_params["template_params"] = custom_values_for_video
|
||||||
|
|
||||||
# Add ref_audio if uploaded
|
|
||||||
if ref_audio_path:
|
|
||||||
video_params["ref_audio"] = str(ref_audio_path)
|
|
||||||
|
|
||||||
result = run_async(pixelle_video.generate_video(**video_params))
|
result = run_async(pixelle_video.generate_video(**video_params))
|
||||||
|
|
||||||
progress_bar.progress(100)
|
progress_bar.progress(100)
|
||||||
|
|||||||
@@ -179,6 +179,31 @@
|
|||||||
"settings.comfyui.runninghub_api_key": "RunningHub API Key",
|
"settings.comfyui.runninghub_api_key": "RunningHub API Key",
|
||||||
"settings.comfyui.runninghub_api_key_help": "Visit https://runninghub.ai to register and get API Key",
|
"settings.comfyui.runninghub_api_key_help": "Visit https://runninghub.ai to register and get API Key",
|
||||||
|
|
||||||
|
"tts.inference_mode": "Synthesis Mode",
|
||||||
|
"tts.mode.local": "Local Synthesis",
|
||||||
|
"tts.mode.comfyui": "ComfyUI Synthesis",
|
||||||
|
"tts.mode.local_hint": "💡 Using Edge TTS, no configuration required, ready to use",
|
||||||
|
"tts.mode.comfyui_hint": "⚙️ Using ComfyUI workflows, flexible and powerful",
|
||||||
|
|
||||||
|
"tts.voice_selector": "Voice Selection",
|
||||||
|
"tts.speed": "Speed",
|
||||||
|
"tts.speed_label": "{speed}x",
|
||||||
|
|
||||||
|
"tts.voice.zh_CN_XiaoxiaoNeural": "zh-CN-XiaoxiaoNeural",
|
||||||
|
"tts.voice.zh_CN_XiaoyiNeural": "zh-CN-XiaoyiNeural",
|
||||||
|
"tts.voice.zh_CN_YunjianNeural": "zh-CN-YunjianNeural",
|
||||||
|
"tts.voice.zh_CN_YunxiNeural": "zh-CN-YunxiNeural",
|
||||||
|
"tts.voice.zh_CN_YunyangNeural": "zh-CN-YunyangNeural",
|
||||||
|
"tts.voice.zh_CN_YunyeNeural": "zh-CN-YunyeNeural",
|
||||||
|
"tts.voice.zh_CN_YunfengNeural": "zh-CN-YunfengNeural",
|
||||||
|
"tts.voice.zh_CN_liaoning_XiaobeiNeural": "zh-CN-liaoning-XiaobeiNeural",
|
||||||
|
"tts.voice.en_US_AriaNeural": "en-US-AriaNeural",
|
||||||
|
"tts.voice.en_US_JennyNeural": "en-US-JennyNeural",
|
||||||
|
"tts.voice.en_US_GuyNeural": "en-US-GuyNeural",
|
||||||
|
"tts.voice.en_US_DavisNeural": "en-US-DavisNeural",
|
||||||
|
"tts.voice.en_GB_SoniaNeural": "en-GB-SoniaNeural",
|
||||||
|
"tts.voice.en_GB_RyanNeural": "en-GB-RyanNeural",
|
||||||
|
|
||||||
"tts.selector": "Workflow Selection",
|
"tts.selector": "Workflow Selection",
|
||||||
"tts.what": "Converts narration text to natural human-like speech (some workflows support reference audio for voice cloning)",
|
"tts.what": "Converts narration text to natural human-like speech (some workflows support reference audio for voice cloning)",
|
||||||
"tts.how": "Place tts_xxx.json workflow files in workflows/selfhost/ (local ComfyUI) or workflows/runninghub/ (cloud) folder",
|
"tts.how": "Place tts_xxx.json workflow files in workflows/selfhost/ (local ComfyUI) or workflows/runninghub/ (cloud) folder",
|
||||||
|
|||||||
@@ -179,6 +179,31 @@
|
|||||||
"settings.comfyui.runninghub_api_key": "RunningHub API 密钥",
|
"settings.comfyui.runninghub_api_key": "RunningHub API 密钥",
|
||||||
"settings.comfyui.runninghub_api_key_help": "访问 https://runninghub.ai 注册并获取 API Key",
|
"settings.comfyui.runninghub_api_key_help": "访问 https://runninghub.ai 注册并获取 API Key",
|
||||||
|
|
||||||
|
"tts.inference_mode": "合成方式",
|
||||||
|
"tts.mode.local": "本地合成",
|
||||||
|
"tts.mode.comfyui": "ComfyUI 合成",
|
||||||
|
"tts.mode.local_hint": "💡 使用 Edge TTS,无需配置,开箱即用(请确保网络环境可用)",
|
||||||
|
"tts.mode.comfyui_hint": "⚙️ 使用 ComfyUI 工作流,灵活强大",
|
||||||
|
|
||||||
|
"tts.voice_selector": "音色选择",
|
||||||
|
"tts.speed": "语速",
|
||||||
|
"tts.speed_label": "{speed}x",
|
||||||
|
|
||||||
|
"tts.voice.zh_CN_XiaoxiaoNeural": "女声-温柔(晓晓)",
|
||||||
|
"tts.voice.zh_CN_XiaoyiNeural": "女声-甜美(晓伊)",
|
||||||
|
"tts.voice.zh_CN_YunjianNeural": "男声-专业(云健)",
|
||||||
|
"tts.voice.zh_CN_YunxiNeural": "男声-磁性(云希)",
|
||||||
|
"tts.voice.zh_CN_YunyangNeural": "男声-新闻(云扬)",
|
||||||
|
"tts.voice.zh_CN_YunyeNeural": "男声-自然(云野)",
|
||||||
|
"tts.voice.zh_CN_YunfengNeural": "男声-沉稳(云锋)",
|
||||||
|
"tts.voice.zh_CN_liaoning_XiaobeiNeural": "女声-东北(小北)",
|
||||||
|
"tts.voice.en_US_AriaNeural": "女声-自然(Aria)",
|
||||||
|
"tts.voice.en_US_JennyNeural": "女声-温暖(Jenny)",
|
||||||
|
"tts.voice.en_US_GuyNeural": "男声-标准(Guy)",
|
||||||
|
"tts.voice.en_US_DavisNeural": "男声-友好(Davis)",
|
||||||
|
"tts.voice.en_GB_SoniaNeural": "女声-英式(Sonia)",
|
||||||
|
"tts.voice.en_GB_RyanNeural": "男声-英式(Ryan)",
|
||||||
|
|
||||||
"tts.selector": "工作流选择",
|
"tts.selector": "工作流选择",
|
||||||
"tts.what": "将旁白文本转换为真人般的自然语音(部分工作流支持参考音频克隆声音)",
|
"tts.what": "将旁白文本转换为真人般的自然语音(部分工作流支持参考音频克隆声音)",
|
||||||
"tts.how": "将 tts_xxx.json 工作流文件放入 workflows/selfhost/(本地 ComfyUI)或 workflows/runninghub/(云端)文件夹",
|
"tts.how": "将 tts_xxx.json 工作流文件放入 workflows/selfhost/(本地 ComfyUI)或 workflows/runninghub/(云端)文件夹",
|
||||||
|
|||||||
Reference in New Issue
Block a user