312 lines
10 KiB
Python
312 lines
10 KiB
Python
"""
|
|
TTS (Text-to-Speech) Service - Dual implementation (Edge TTS + ComfyUI)
|
|
"""
|
|
|
|
import uuid
|
|
from typing import Optional
|
|
|
|
from comfykit import ComfyKit
|
|
from loguru import logger
|
|
|
|
from reelforge.services.comfy_base_service import ComfyBaseService
|
|
from reelforge.utils.os_util import get_temp_path
|
|
|
|
|
|
class TTSService(ComfyBaseService):
|
|
"""
|
|
TTS (Text-to-Speech) service - Dual implementation
|
|
|
|
Supports two TTS methods:
|
|
1. Edge TTS (default) - Free, local SDK, no workflow needed
|
|
2. ComfyUI Workflow - Workflow-based, requires ComfyUI setup
|
|
|
|
Usage:
|
|
# Use default (edge-tts)
|
|
audio_path = await reelforge.tts(text="Hello, world!")
|
|
|
|
# Explicitly use edge-tts
|
|
audio_path = await reelforge.tts(
|
|
text="你好,世界!",
|
|
workflow="edge"
|
|
)
|
|
|
|
# Use ComfyUI workflow
|
|
audio_path = await reelforge.tts(
|
|
text="Hello",
|
|
workflow="tts_comfyui.json"
|
|
)
|
|
|
|
# List available workflows
|
|
workflows = reelforge.tts.list_workflows()
|
|
"""
|
|
|
|
WORKFLOW_PREFIX = "tts_"
|
|
DEFAULT_WORKFLOW = "edge" # Default to edge-tts
|
|
WORKFLOWS_DIR = "workflows"
|
|
|
|
# Built-in providers (not workflow files)
|
|
BUILTIN_PROVIDERS = ["edge", "edge-tts"]
|
|
|
|
def __init__(self, config: dict):
|
|
"""
|
|
Initialize TTS service
|
|
|
|
Args:
|
|
config: Full application config dict
|
|
"""
|
|
super().__init__(config, service_name="tts")
|
|
|
|
def _resolve_workflow(self, workflow: Optional[str] = None) -> str:
|
|
"""
|
|
Resolve workflow to actual workflow path or provider name
|
|
|
|
Args:
|
|
workflow: Workflow filename or provider name (e.g., "edge", "tts_default.json")
|
|
|
|
Returns:
|
|
Workflow file path or provider name
|
|
"""
|
|
# 1. If not specified, use default
|
|
if workflow is None:
|
|
workflow = self._get_default_workflow()
|
|
|
|
# 2. If it's a built-in provider, return as-is
|
|
if workflow in self.BUILTIN_PROVIDERS:
|
|
logger.debug(f"Using built-in TTS provider: {workflow}")
|
|
return workflow
|
|
|
|
# 3. Otherwise, treat as workflow file (use parent logic)
|
|
return super()._resolve_workflow(workflow)
|
|
|
|
async def __call__(
|
|
self,
|
|
text: str,
|
|
workflow: Optional[str] = None,
|
|
# ComfyUI connection (optional overrides, only for workflow mode)
|
|
comfyui_url: Optional[str] = None,
|
|
runninghub_api_key: Optional[str] = None,
|
|
# Common TTS parameters (work for both edge-tts and workflows)
|
|
voice: Optional[str] = None,
|
|
rate: Optional[str] = None,
|
|
volume: Optional[str] = None,
|
|
pitch: Optional[str] = None,
|
|
**params
|
|
) -> str:
|
|
"""
|
|
Generate speech using edge-tts or ComfyUI workflow
|
|
|
|
Args:
|
|
text: Text to convert to speech
|
|
workflow: Workflow filename or provider name (default: "edge")
|
|
- "edge" or "edge-tts": Use local edge-tts SDK
|
|
- "tts_xxx.json": Use ComfyUI workflow
|
|
- Absolute path/URL/RunningHub ID: Also supported
|
|
comfyui_url: ComfyUI URL (only for workflow mode)
|
|
runninghub_api_key: RunningHub API key (only for workflow mode)
|
|
voice: Voice ID
|
|
rate: Speech rate (e.g., "+0%", "+50%", "-20%")
|
|
volume: Speech volume (e.g., "+0%")
|
|
pitch: Speech pitch (e.g., "+0Hz")
|
|
**params: Additional parameters
|
|
|
|
Returns:
|
|
Generated audio file path
|
|
|
|
Examples:
|
|
# Simplest: use default (edge-tts)
|
|
audio_path = await reelforge.tts(text="Hello, world!")
|
|
|
|
# Explicitly use edge-tts with parameters
|
|
audio_path = await reelforge.tts(
|
|
text="你好,世界!",
|
|
workflow="edge",
|
|
voice="zh-CN-XiaoxiaoNeural",
|
|
rate="+20%"
|
|
)
|
|
|
|
# Use ComfyUI workflow
|
|
audio_path = await reelforge.tts(
|
|
text="Hello",
|
|
workflow="tts_default.json"
|
|
)
|
|
|
|
# With absolute path
|
|
audio_path = await reelforge.tts(
|
|
text="Hello",
|
|
workflow="/path/to/custom_tts.json"
|
|
)
|
|
"""
|
|
# 1. Resolve workflow path or provider
|
|
workflow_or_provider = self._resolve_workflow(workflow=workflow)
|
|
|
|
# 2. Determine execution path
|
|
if workflow_or_provider in self.BUILTIN_PROVIDERS:
|
|
# Use edge-tts
|
|
return await self._call_edge_tts(
|
|
text=text,
|
|
voice=voice,
|
|
rate=rate,
|
|
volume=volume,
|
|
pitch=pitch,
|
|
**params
|
|
)
|
|
else:
|
|
# Use ComfyUI workflow
|
|
return await self._call_comfyui_workflow(
|
|
workflow_path=workflow_or_provider,
|
|
text=text,
|
|
comfyui_url=comfyui_url,
|
|
runninghub_api_key=runninghub_api_key,
|
|
voice=voice,
|
|
rate=rate,
|
|
volume=volume,
|
|
pitch=pitch,
|
|
**params
|
|
)
|
|
|
|
async def _call_edge_tts(
|
|
self,
|
|
text: str,
|
|
voice: Optional[str] = None,
|
|
rate: Optional[str] = None,
|
|
volume: Optional[str] = None,
|
|
pitch: Optional[str] = None,
|
|
**params
|
|
) -> str:
|
|
"""
|
|
Generate speech using edge-tts SDK
|
|
|
|
Args:
|
|
text: Text to convert to speech
|
|
voice: Voice ID (default: zh-CN-YunjianNeural)
|
|
rate: Speech rate (default: +0%)
|
|
volume: Speech volume (default: +0%)
|
|
pitch: Speech pitch (default: +0Hz)
|
|
**params: Additional parameters (e.g., retry_count, retry_delay)
|
|
|
|
Returns:
|
|
Generated audio file path
|
|
"""
|
|
from reelforge.utils.tts_util import edge_tts
|
|
|
|
logger.info(f"🎙️ Using edge-tts (local SDK)")
|
|
|
|
# Generate temp file path
|
|
output_path = get_temp_path(f"{uuid.uuid4().hex}.mp3")
|
|
|
|
# Call edge-tts with output_path to save directly
|
|
try:
|
|
audio_bytes = await edge_tts(
|
|
text=text,
|
|
voice=voice or "zh-CN-YunjianNeural",
|
|
rate=rate or "+0%",
|
|
volume=volume or "+0%",
|
|
pitch=pitch or "+0Hz",
|
|
output_path=output_path,
|
|
**params
|
|
)
|
|
|
|
logger.info(f"✅ Generated audio (edge-tts): {output_path}")
|
|
return output_path
|
|
|
|
except Exception as e:
|
|
logger.error(f"Edge TTS generation error: {e}")
|
|
raise
|
|
|
|
async def _call_comfyui_workflow(
|
|
self,
|
|
workflow_path: str,
|
|
text: str,
|
|
comfyui_url: Optional[str] = None,
|
|
runninghub_api_key: Optional[str] = None,
|
|
voice: Optional[str] = None,
|
|
rate: Optional[str] = None,
|
|
volume: Optional[str] = None,
|
|
pitch: Optional[str] = None,
|
|
**params
|
|
) -> str:
|
|
"""
|
|
Generate speech using ComfyUI workflow
|
|
|
|
Args:
|
|
workflow_path: Path to workflow file
|
|
text: Text to convert to speech
|
|
comfyui_url: ComfyUI URL
|
|
runninghub_api_key: RunningHub API key
|
|
voice: Voice ID (workflow-specific)
|
|
rate: Speech rate (workflow-specific)
|
|
volume: Speech volume (workflow-specific)
|
|
pitch: Speech pitch (workflow-specific)
|
|
**params: Additional workflow parameters
|
|
|
|
Returns:
|
|
Generated audio file path/URL
|
|
"""
|
|
logger.info(f"🎙️ Using ComfyUI workflow: {workflow_path}")
|
|
|
|
# 1. Prepare ComfyKit config
|
|
kit_config = self._prepare_comfykit_config(
|
|
comfyui_url=comfyui_url,
|
|
runninghub_api_key=runninghub_api_key
|
|
)
|
|
|
|
# 2. Build workflow parameters
|
|
workflow_params = {"text": text}
|
|
|
|
# Add optional TTS parameters
|
|
if voice is not None:
|
|
workflow_params["voice"] = voice
|
|
if rate is not None:
|
|
workflow_params["rate"] = rate
|
|
if volume is not None:
|
|
workflow_params["volume"] = volume
|
|
if pitch is not None:
|
|
workflow_params["pitch"] = pitch
|
|
|
|
# Add any additional parameters
|
|
workflow_params.update(params)
|
|
|
|
logger.debug(f"Workflow parameters: {workflow_params}")
|
|
|
|
# 3. Execute workflow
|
|
try:
|
|
kit = ComfyKit(**kit_config)
|
|
|
|
logger.info(f"Executing TTS workflow: {workflow_path}")
|
|
result = await kit.execute(workflow_path, workflow_params)
|
|
|
|
# 4. Handle result
|
|
if result.status != "completed":
|
|
error_msg = result.msg or "Unknown error"
|
|
logger.error(f"TTS generation failed: {error_msg}")
|
|
raise Exception(f"TTS generation failed: {error_msg}")
|
|
|
|
# ComfyKit result can have audio files in different output types
|
|
# Try to get audio file path from result
|
|
audio_path = None
|
|
|
|
# Check for audio files in result.audios (if available)
|
|
if hasattr(result, 'audios') and result.audios:
|
|
audio_path = result.audios[0]
|
|
# Check for files in result.files
|
|
elif hasattr(result, 'files') and result.files:
|
|
audio_path = result.files[0]
|
|
# Check in outputs dictionary
|
|
elif hasattr(result, 'outputs') and result.outputs:
|
|
# Try to find audio file in outputs
|
|
for key, value in result.outputs.items():
|
|
if isinstance(value, str) and any(value.endswith(ext) for ext in ['.mp3', '.wav', '.flac']):
|
|
audio_path = value
|
|
break
|
|
|
|
if not audio_path:
|
|
logger.error("No audio file generated")
|
|
raise Exception("No audio file generated by workflow")
|
|
|
|
logger.info(f"✅ Generated audio (ComfyUI): {audio_path}")
|
|
return audio_path
|
|
|
|
except Exception as e:
|
|
logger.error(f"TTS generation error: {e}")
|
|
raise
|