tts支持本地合成

This commit is contained in:
puke
2025-11-06 21:06:14 +08:00
parent 56b6b74af7
commit 393cdb8f0a
9 changed files with 531 additions and 112 deletions

View File

@@ -14,9 +14,28 @@ class LLMConfig(BaseModel):
model: str = Field(default="", description="LLM Model Name")
class TTSLocalConfig(BaseModel):
    """Local TTS configuration (Edge TTS).

    Defaults used when TTS inference runs locally through Edge TTS
    instead of a ComfyUI workflow.
    """

    # Edge TTS voice identifier (e.g. "zh-CN-YunjianNeural")
    voice: str = Field(default="zh-CN-YunjianNeural", description="Edge TTS voice ID")
    # Speed multiplier; validators bound it to the 0.5x-2.0x range
    speed: float = Field(default=1.2, ge=0.5, le=2.0, description="Speech speed multiplier (0.5-2.0)")
class TTSComfyUIConfig(BaseModel):
    """ComfyUI TTS configuration.

    Settings used when TTS inference is delegated to a ComfyUI workflow.
    """

    # Workflow filename used when the caller does not pass one explicitly
    default_workflow: Optional[str] = Field(default=None, description="Default TTS workflow (optional)")
class TTSSubConfig(BaseModel):
    """TTS-specific configuration (under comfyui.tts).

    Selects the TTS inference backend ("local" Edge TTS or "comfyui"
    workflow) and carries the per-backend sub-configurations.
    """

    inference_mode: str = Field(default="local", description="TTS inference mode: 'local' or 'comfyui'")
    local: TTSLocalConfig = Field(default_factory=TTSLocalConfig, description="Local TTS (Edge TTS) configuration")
    comfyui: TTSComfyUIConfig = Field(default_factory=TTSComfyUIConfig, description="ComfyUI TTS configuration")

    # Backward compatibility: older code read `tts.default_workflow` at the top
    # level, so expose it as a read-only property delegating to the nested
    # ComfyUI config. NOTE(fix): the duplicate annotated *field* of the same
    # name was removed — declaring both a pydantic field and a @property with
    # one name makes the property shadow the field in the class namespace and
    # conflicts with pydantic's field collection.
    @property
    def default_workflow(self) -> Optional[str]:
        """Get default workflow (for backward compatibility)."""
        return self.comfyui.default_workflow
class ImageSubConfig(BaseModel):

View File

@@ -24,10 +24,11 @@ class StoryboardConfig:
video_fps: int = 30 # Frame rate
# Audio parameters
voice_id: str = "[Chinese] zh-CN Yunjian" # Default voice
tts_workflow: Optional[str] = None # TTS workflow filename (None = use default)
tts_speed: float = 1.2 # TTS speed multiplier (1.0 = normal, >1.0 = faster)
ref_audio: Optional[str] = None # Reference audio for voice cloning (only some workflows support this)
tts_inference_mode: str = "local" # TTS inference mode: "local" or "comfyui"
voice_id: Optional[str] = None # Voice ID (for local: Edge TTS voice ID; for comfyui: workflow-specific)
tts_workflow: Optional[str] = None # TTS workflow filename (for ComfyUI mode, None = use default)
tts_speed: Optional[float] = None # TTS speed multiplier (0.5-2.0, 1.0 = normal)
ref_audio: Optional[str] = None # Reference audio for voice cloning (ComfyUI mode only)
# Image parameters
image_width: int = 1024

View File

@@ -62,10 +62,17 @@ class StandardPipeline(BasePipeline):
# === Basic Config ===
n_scenes: int = 5, # Only used in generate mode; ignored in fixed mode
voice_id: str = "[Chinese] zh-CN Yunjian",
tts_workflow: Optional[str] = None,
tts_speed: float = 1.2,
ref_audio: Optional[str] = None, # Reference audio for voice cloning
# === TTS Parameters ===
tts_inference_mode: Optional[str] = None, # "local" or "comfyui"
tts_voice: Optional[str] = None, # For local mode: Edge TTS voice ID
tts_speed: Optional[float] = None, # Speed multiplier (0.5-2.0)
tts_workflow: Optional[str] = None, # For ComfyUI mode: workflow path
ref_audio: Optional[str] = None, # For ComfyUI mode: reference audio
# Deprecated (kept for backward compatibility)
voice_id: Optional[str] = None,
output_path: Optional[str] = None,
# === LLM Parameters ===
@@ -191,6 +198,29 @@ class StandardPipeline(BasePipeline):
output_path = get_task_final_video_path(task_id)
logger.info(f" Will copy final video to: {user_specified_output}")
# Determine TTS inference mode and parameters
# Priority: explicit params > backward compatibility > config defaults
if tts_inference_mode is None:
# Check if user provided ComfyUI-specific params
if tts_workflow is not None or ref_audio is not None:
tts_inference_mode = "comfyui"
# Check if user provided old voice_id param (backward compatibility)
elif voice_id is not None:
tts_inference_mode = "comfyui"
if tts_voice is None:
tts_voice = voice_id
else:
# Use config default
tts_config = self.core.config.get("comfyui", {}).get("tts", {})
tts_inference_mode = tts_config.get("inference_mode", "local")
# Set voice_id based on mode for StoryboardConfig
final_voice_id = None
if tts_inference_mode == "local":
final_voice_id = tts_voice or voice_id
else: # comfyui
final_voice_id = voice_id # For ComfyUI, might be None
# Create storyboard config
config = StoryboardConfig(
task_id=task_id,
@@ -200,7 +230,8 @@ class StandardPipeline(BasePipeline):
min_image_prompt_words=min_image_prompt_words,
max_image_prompt_words=max_image_prompt_words,
video_fps=video_fps,
voice_id=voice_id,
tts_inference_mode=tts_inference_mode,
voice_id=final_voice_id,
tts_workflow=tts_workflow,
tts_speed=tts_speed,
ref_audio=ref_audio,

View File

@@ -124,18 +124,29 @@ class FrameProcessor:
from pixelle_video.utils.os_util import get_task_frame_path
output_path = get_task_frame_path(config.task_id, frame.index, "audio")
# Call TTS with specific output path and workflow
# Build TTS params based on inference mode
tts_params = {
"text": frame.narration,
"workflow": config.tts_workflow,
"voice": config.voice_id,
"speed": config.tts_speed,
"inference_mode": config.tts_inference_mode,
"output_path": output_path,
}
# Add ref_audio if provided
if config.ref_audio:
tts_params["ref_audio"] = config.ref_audio
if config.tts_inference_mode == "local":
# Local mode: pass voice and speed
if config.voice_id:
tts_params["voice"] = config.voice_id
if config.tts_speed is not None:
tts_params["speed"] = config.tts_speed
else: # comfyui
# ComfyUI mode: pass workflow, voice, speed, and ref_audio
if config.tts_workflow:
tts_params["workflow"] = config.tts_workflow
if config.voice_id:
tts_params["voice"] = config.voice_id
if config.tts_speed is not None:
tts_params["speed"] = config.tts_speed
if config.ref_audio:
tts_params["ref_audio"] = config.ref_audio
audio_path = await self.core.tts(**tts_params)

View File

@@ -1,13 +1,18 @@
"""
TTS (Text-to-Speech) Service - ComfyUI Workflow-based implementation
TTS (Text-to-Speech) Service - Supports both local and ComfyUI inference
"""
import os
import uuid
from pathlib import Path
from typing import Optional
from comfykit import ComfyKit
from loguru import logger
from pixelle_video.services.comfy_base_service import ComfyBaseService
from pixelle_video.utils.tts_util import edge_tts
from pixelle_video.tts_voices import speed_to_rate
class TTSService(ComfyBaseService):
@@ -52,22 +57,25 @@ class TTSService(ComfyBaseService):
comfyui_url: Optional[str] = None,
runninghub_api_key: Optional[str] = None,
# TTS parameters
voice: str = "[Chinese] zh-CN Yunjian",
speed: float = 1.2,
voice: Optional[str] = None,
speed: Optional[float] = None,
# Inference mode override
inference_mode: Optional[str] = None,
# Output path
output_path: Optional[str] = None,
**params
) -> str:
"""
Generate speech using ComfyUI workflow
Generate speech using local Edge TTS or ComfyUI workflow
Args:
text: Text to convert to speech
workflow: Workflow filename (default: from config)
workflow: Workflow filename (for ComfyUI mode, default: from config)
comfyui_url: ComfyUI URL (optional, overrides config)
runninghub_api_key: RunningHub API key (optional, overrides config)
voice: Voice ID (workflow-specific)
voice: Voice ID (for local mode: Edge TTS voice ID; for ComfyUI: workflow-specific)
speed: Speech speed multiplier (1.0 = normal, >1.0 = faster, <1.0 = slower)
inference_mode: Override inference mode ("local" or "comfyui", default: from config)
output_path: Custom output path (auto-generated if None)
**params: Additional workflow parameters
@@ -75,49 +83,103 @@ class TTSService(ComfyBaseService):
Generated audio file path
Examples:
# Simplest: use default workflow
audio_path = await pixelle_video.tts(text="Hello, world!")
# Use specific workflow
# Local inference (Edge TTS)
audio_path = await pixelle_video.tts(
text="你好,世界!",
workflow="tts_edge.json"
)
# With voice and speed
audio_path = await pixelle_video.tts(
text="Hello",
workflow="tts_edge.json",
voice="[Chinese] zh-CN Xiaoxiao",
text="Hello, world!",
inference_mode="local",
voice="zh-CN-YunjianNeural",
speed=1.2
)
# With absolute path
# ComfyUI inference
audio_path = await pixelle_video.tts(
text="Hello",
workflow="/path/to/custom_tts.json"
)
# With custom ComfyUI server
audio_path = await pixelle_video.tts(
text="Hello",
comfyui_url="http://192.168.1.100:8188"
text="你好,世界!",
inference_mode="comfyui",
workflow="runninghub/tts_edge.json"
)
"""
# 1. Resolve workflow (returns structured info)
workflow_info = self._resolve_workflow(workflow=workflow)
# Determine inference mode (param > config)
mode = inference_mode or self.config.get("inference_mode", "local")
# 2. Execute ComfyUI workflow
return await self._call_comfyui_workflow(
workflow_info=workflow_info,
text=text,
comfyui_url=comfyui_url,
runninghub_api_key=runninghub_api_key,
voice=voice,
speed=speed,
output_path=output_path,
**params
)
# Route to appropriate implementation
if mode == "local":
return await self._call_local_tts(
text=text,
voice=voice,
speed=speed,
output_path=output_path
)
else: # comfyui
# 1. Resolve workflow (returns structured info)
workflow_info = self._resolve_workflow(workflow=workflow)
# 2. Execute ComfyUI workflow
return await self._call_comfyui_workflow(
workflow_info=workflow_info,
text=text,
comfyui_url=comfyui_url,
runninghub_api_key=runninghub_api_key,
voice=voice,
speed=speed,
output_path=output_path,
**params
)
async def _call_local_tts(
    self,
    text: str,
    voice: Optional[str] = None,
    speed: Optional[float] = None,
    output_path: Optional[str] = None,
) -> str:
    """
    Generate speech using local Edge TTS.

    Args:
        text: Text to convert to speech
        voice: Edge TTS voice ID (default: from config)
        speed: Speech speed multiplier (default: from config)
        output_path: Custom output path (auto-generated if None)

    Returns:
        Generated audio file path

    Raises:
        Exception: re-raised from the underlying Edge TTS call after logging.
    """
    # Defaults live under the "local" sub-section of the TTS config.
    local_config = self.config.get("local", {})

    # Explicit parameters win over config. `speed` is compared against None
    # (not truthiness) so a sub-1.0 multiplier is still honored.
    final_voice = voice or local_config.get("voice", "zh-CN-YunjianNeural")
    final_speed = speed if speed is not None else local_config.get("speed", 1.2)

    # Edge TTS expects a percentage rate string ("+20%"), not a multiplier.
    rate = speed_to_rate(final_speed)

    logger.info(f"🎙️ Using local Edge TTS: voice={final_voice}, speed={final_speed}x (rate={rate})")

    # Generate a unique output path if the caller did not supply one.
    if not output_path:
        unique_id = uuid.uuid4().hex
        output_path = f"output/{unique_id}.mp3"
        # Ensure the default output directory exists for the generated path.
        Path("output").mkdir(parents=True, exist_ok=True)

    try:
        # edge_tts receives output_path and is expected to write the file
        # there; its return value was previously captured into an unused
        # `audio_bytes` local, which has been dropped.
        await edge_tts(
            text=text,
            voice=final_voice,
            rate=rate,
            output_path=output_path
        )
        logger.info(f"✅ Generated audio (local Edge TTS): {output_path}")
        return output_path
    except Exception as e:
        logger.error(f"Local TTS generation error: {e}")
        raise
async def _call_comfyui_workflow(
self,

147
pixelle_video/tts_voices.py Normal file
View File

@@ -0,0 +1,147 @@
"""
TTS Voice Configuration
Defines available voices for local Edge TTS inference.
"""
from typing import List, Dict, Any
def _edge_voice(voice_id: str, locale: str, gender: str) -> Dict[str, Any]:
    """Build one Edge TTS voice entry; the i18n label key is derived from the ID."""
    return {
        "id": voice_id,
        "label_key": "tts.voice." + voice_id.replace("-", "_"),
        "locale": locale,
        "gender": gender,
    }


# Edge TTS voice presets for local inference
EDGE_TTS_VOICES: List[Dict[str, Any]] = [
    # Chinese voices
    _edge_voice("zh-CN-XiaoxiaoNeural", "zh-CN", "female"),
    _edge_voice("zh-CN-XiaoyiNeural", "zh-CN", "female"),
    _edge_voice("zh-CN-YunjianNeural", "zh-CN", "male"),
    _edge_voice("zh-CN-YunxiNeural", "zh-CN", "male"),
    _edge_voice("zh-CN-YunyangNeural", "zh-CN", "male"),
    _edge_voice("zh-CN-YunyeNeural", "zh-CN", "male"),
    _edge_voice("zh-CN-YunfengNeural", "zh-CN", "male"),
    _edge_voice("zh-CN-liaoning-XiaobeiNeural", "zh-CN", "female"),
    # English voices
    _edge_voice("en-US-AriaNeural", "en-US", "female"),
    _edge_voice("en-US-JennyNeural", "en-US", "female"),
    _edge_voice("en-US-GuyNeural", "en-US", "male"),
    _edge_voice("en-US-DavisNeural", "en-US", "male"),
    _edge_voice("en-GB-SoniaNeural", "en-GB", "female"),
    _edge_voice("en-GB-RyanNeural", "en-GB", "male"),
]


def get_voice_display_name(voice_id: str, tr_func=None, locale: str = "zh_CN") -> str:
    """
    Get display name for voice.

    Args:
        voice_id: Voice ID (e.g., "zh-CN-YunjianNeural")
        tr_func: Translation function (optional)
        locale: Current locale (default: "zh_CN")

    Returns:
        Translated label when the voice is known, the locale is "zh_CN",
        and a translation function is supplied; the raw voice ID otherwise.
    """
    for entry in EDGE_TTS_VOICES:
        if entry["id"] == voice_id:
            # Only the Chinese UI carries translated labels; every other
            # locale falls back to the raw voice ID.
            if locale == "zh_CN" and tr_func:
                return tr_func(entry["label_key"])
            return voice_id
    # Unknown voice: nothing to translate, return the ID as-is.
    return voice_id
def speed_to_rate(speed: float) -> str:
    """
    Convert speed multiplier to Edge TTS rate parameter.

    Args:
        speed: Speed multiplier (1.0 = normal, 1.2 = 120%)

    Returns:
        Rate string (e.g., "+20%", "-10%")

    Examples:
        1.0 → "+0%"
        1.2 → "+20%"
        0.8 → "-20%"
    """
    # round(), not int(): truncation made 1.2 yield "+19%" because
    # (1.2 - 1.0) * 100 evaluates to 19.999... in binary floating point.
    percentage = round((speed - 1.0) * 100)
    sign = "+" if percentage >= 0 else ""
    return f"{sign}{percentage}%"