302 lines
11 KiB
Python
302 lines
11 KiB
Python
"""
|
|
TTS (Text-to-Speech) Service - Supports both local and ComfyUI inference
|
|
"""
|
|
|
|
import os
|
|
import uuid
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
from comfykit import ComfyKit
|
|
from loguru import logger
|
|
|
|
from pixelle_video.services.comfy_base_service import ComfyBaseService
|
|
from pixelle_video.utils.tts_util import edge_tts
|
|
from pixelle_video.tts_voices import speed_to_rate
|
|
|
|
|
|
class TTSService(ComfyBaseService):
|
|
"""
|
|
TTS (Text-to-Speech) service - Workflow-based
|
|
|
|
Uses ComfyKit to execute TTS workflows.
|
|
|
|
Usage:
|
|
# Use default workflow
|
|
audio_path = await pixelle_video.tts(text="Hello, world!")
|
|
|
|
# Use specific workflow
|
|
audio_path = await pixelle_video.tts(
|
|
text="你好,世界!",
|
|
workflow="tts_edge.json"
|
|
)
|
|
|
|
# List available workflows
|
|
workflows = pixelle_video.tts.list_workflows()
|
|
"""
|
|
|
|
WORKFLOW_PREFIX = "tts_"
|
|
DEFAULT_WORKFLOW = None # No hardcoded default, must be configured
|
|
WORKFLOWS_DIR = "workflows"
|
|
|
|
def __init__(self, config: dict):
|
|
"""
|
|
Initialize TTS service
|
|
|
|
Args:
|
|
config: Full application config dict
|
|
"""
|
|
super().__init__(config, service_name="tts")
|
|
|
|
|
|
async def __call__(
|
|
self,
|
|
text: str,
|
|
workflow: Optional[str] = None,
|
|
# ComfyUI connection (optional overrides)
|
|
comfyui_url: Optional[str] = None,
|
|
runninghub_api_key: Optional[str] = None,
|
|
# TTS parameters
|
|
voice: Optional[str] = None,
|
|
speed: Optional[float] = None,
|
|
# Inference mode override
|
|
inference_mode: Optional[str] = None,
|
|
# Output path
|
|
output_path: Optional[str] = None,
|
|
**params
|
|
) -> str:
|
|
"""
|
|
Generate speech using local Edge TTS or ComfyUI workflow
|
|
|
|
Args:
|
|
text: Text to convert to speech
|
|
workflow: Workflow filename (for ComfyUI mode, default: from config)
|
|
comfyui_url: ComfyUI URL (optional, overrides config)
|
|
runninghub_api_key: RunningHub API key (optional, overrides config)
|
|
voice: Voice ID (for local mode: Edge TTS voice ID; for ComfyUI: workflow-specific)
|
|
speed: Speech speed multiplier (1.0 = normal, >1.0 = faster, <1.0 = slower)
|
|
inference_mode: Override inference mode ("local" or "comfyui", default: from config)
|
|
output_path: Custom output path (auto-generated if None)
|
|
**params: Additional workflow parameters
|
|
|
|
Returns:
|
|
Generated audio file path
|
|
|
|
Examples:
|
|
# Local inference (Edge TTS)
|
|
audio_path = await pixelle_video.tts(
|
|
text="Hello, world!",
|
|
inference_mode="local",
|
|
voice="zh-CN-YunjianNeural",
|
|
speed=1.2
|
|
)
|
|
|
|
# ComfyUI inference
|
|
audio_path = await pixelle_video.tts(
|
|
text="你好,世界!",
|
|
inference_mode="comfyui",
|
|
workflow="runninghub/tts_edge.json"
|
|
)
|
|
"""
|
|
# Determine inference mode (param > config)
|
|
mode = inference_mode or self.config.get("inference_mode", "local")
|
|
|
|
# Route to appropriate implementation
|
|
if mode == "local":
|
|
return await self._call_local_tts(
|
|
text=text,
|
|
voice=voice,
|
|
speed=speed,
|
|
output_path=output_path
|
|
)
|
|
else: # comfyui
|
|
# 1. Resolve workflow (returns structured info)
|
|
workflow_info = self._resolve_workflow(workflow=workflow)
|
|
|
|
# 2. Execute ComfyUI workflow
|
|
return await self._call_comfyui_workflow(
|
|
workflow_info=workflow_info,
|
|
text=text,
|
|
comfyui_url=comfyui_url,
|
|
runninghub_api_key=runninghub_api_key,
|
|
voice=voice,
|
|
speed=speed,
|
|
output_path=output_path,
|
|
**params
|
|
)
|
|
|
|
async def _call_local_tts(
|
|
self,
|
|
text: str,
|
|
voice: Optional[str] = None,
|
|
speed: Optional[float] = None,
|
|
output_path: Optional[str] = None,
|
|
) -> str:
|
|
"""
|
|
Generate speech using local Edge TTS
|
|
|
|
Args:
|
|
text: Text to convert to speech
|
|
voice: Edge TTS voice ID (default: from config)
|
|
speed: Speech speed multiplier (default: from config)
|
|
output_path: Custom output path (auto-generated if None)
|
|
|
|
Returns:
|
|
Generated audio file path
|
|
"""
|
|
# Get config defaults
|
|
local_config = self.config.get("local", {})
|
|
|
|
# Determine voice and speed (param > config)
|
|
final_voice = voice or local_config.get("voice", "zh-CN-YunjianNeural")
|
|
final_speed = speed if speed is not None else local_config.get("speed", 1.2)
|
|
|
|
# Convert speed to rate parameter
|
|
rate = speed_to_rate(final_speed)
|
|
|
|
logger.info(f"🎙️ Using local Edge TTS: voice={final_voice}, speed={final_speed}x (rate={rate})")
|
|
|
|
# Generate output path if not provided
|
|
if not output_path:
|
|
# Generate unique filename
|
|
unique_id = uuid.uuid4().hex
|
|
output_path = f"output/{unique_id}.mp3"
|
|
|
|
# Ensure output directory exists
|
|
Path("output").mkdir(parents=True, exist_ok=True)
|
|
|
|
# Call Edge TTS
|
|
try:
|
|
audio_bytes = await edge_tts(
|
|
text=text,
|
|
voice=final_voice,
|
|
rate=rate,
|
|
output_path=output_path
|
|
)
|
|
|
|
logger.info(f"✅ Generated audio (local Edge TTS): {output_path}")
|
|
return output_path
|
|
|
|
except Exception as e:
|
|
logger.error(f"Local TTS generation error: {e}")
|
|
raise
|
|
|
|
async def _call_comfyui_workflow(
|
|
self,
|
|
workflow_info: dict,
|
|
text: str,
|
|
comfyui_url: Optional[str] = None,
|
|
runninghub_api_key: Optional[str] = None,
|
|
voice: Optional[str] = None,
|
|
speed: float = 1.0,
|
|
output_path: Optional[str] = None,
|
|
**params
|
|
) -> str:
|
|
"""
|
|
Generate speech using ComfyUI workflow
|
|
|
|
Args:
|
|
workflow_info: Workflow info dict from _resolve_workflow()
|
|
text: Text to convert to speech
|
|
comfyui_url: ComfyUI URL
|
|
runninghub_api_key: RunningHub API key
|
|
voice: Voice ID (workflow-specific)
|
|
speed: Speech speed multiplier (workflow-specific)
|
|
output_path: Custom output path (downloads if URL returned)
|
|
**params: Additional workflow parameters
|
|
|
|
Returns:
|
|
Generated audio file path (local if output_path provided, otherwise URL)
|
|
"""
|
|
logger.info(f"🎙️ Using workflow: {workflow_info['key']}")
|
|
|
|
# 1. Prepare ComfyKit config (supports both selfhost and runninghub)
|
|
kit_config = self._prepare_comfykit_config(
|
|
comfyui_url=comfyui_url,
|
|
runninghub_api_key=runninghub_api_key
|
|
)
|
|
|
|
# 2. Build workflow parameters
|
|
workflow_params = {"text": text}
|
|
|
|
# Add optional TTS parameters
|
|
if voice is not None:
|
|
workflow_params["voice"] = voice
|
|
if speed != 1.0:
|
|
workflow_params["speed"] = speed
|
|
|
|
# Add any additional parameters
|
|
workflow_params.update(params)
|
|
|
|
logger.debug(f"Workflow parameters: {workflow_params}")
|
|
|
|
# 3. Execute workflow (ComfyKit auto-detects based on input type)
|
|
try:
|
|
kit = ComfyKit(**kit_config)
|
|
|
|
# Determine what to pass to ComfyKit based on source
|
|
if workflow_info["source"] == "runninghub" and "workflow_id" in workflow_info:
|
|
# RunningHub: pass workflow_id
|
|
workflow_input = workflow_info["workflow_id"]
|
|
logger.info(f"Executing RunningHub TTS workflow: {workflow_input}")
|
|
else:
|
|
# Selfhost: pass file path
|
|
workflow_input = workflow_info["path"]
|
|
logger.info(f"Executing selfhost TTS workflow: {workflow_input}")
|
|
|
|
result = await kit.execute(workflow_input, workflow_params)
|
|
|
|
# 4. Handle result
|
|
if result.status != "completed":
|
|
error_msg = result.msg or "Unknown error"
|
|
logger.error(f"TTS generation failed: {error_msg}")
|
|
raise Exception(f"TTS generation failed: {error_msg}")
|
|
|
|
# ComfyKit result can have audio files in different output types
|
|
# Try to get audio file path from result
|
|
audio_path = None
|
|
|
|
# Check for audio files in result.audios (if available)
|
|
if hasattr(result, 'audios') and result.audios:
|
|
audio_path = result.audios[0]
|
|
# Check for files in result.files
|
|
elif hasattr(result, 'files') and result.files:
|
|
audio_path = result.files[0]
|
|
# Check in outputs dictionary
|
|
elif hasattr(result, 'outputs') and result.outputs:
|
|
# Try to find audio file in outputs
|
|
for key, value in result.outputs.items():
|
|
if isinstance(value, str) and any(value.endswith(ext) for ext in ['.mp3', '.wav', '.flac']):
|
|
audio_path = value
|
|
break
|
|
|
|
if not audio_path:
|
|
logger.error("No audio file generated")
|
|
raise Exception("No audio file generated by workflow")
|
|
|
|
# If output_path provided and audio_path is URL, download to local
|
|
if output_path and audio_path.startswith(('http://', 'https://')):
|
|
import httpx
|
|
import os
|
|
|
|
# Ensure parent directory exists
|
|
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
|
|
|
logger.info(f"Downloading audio from {audio_path} to {output_path}")
|
|
async with httpx.AsyncClient() as client:
|
|
response = await client.get(audio_path)
|
|
response.raise_for_status()
|
|
|
|
with open(output_path, 'wb') as f:
|
|
f.write(response.content)
|
|
|
|
logger.info(f"✅ Generated audio (ComfyUI): {output_path}")
|
|
return output_path
|
|
|
|
logger.info(f"✅ Generated audio (ComfyUI): {audio_path}")
|
|
return audio_path
|
|
|
|
except Exception as e:
|
|
logger.error(f"TTS generation error: {e}")
|
|
raise
|