# Copyright (C) 2025 AIDC-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
TTS (Text-to-Speech) Service - Supports both local and ComfyUI inference
"""
import os
import uuid
from pathlib import Path
from typing import Optional
from loguru import logger
from pixelle_video.services.comfy_base_service import ComfyBaseService
from pixelle_video.utils.tts_util import edge_tts
from pixelle_video.tts_voices import speed_to_rate
class TTSService(ComfyBaseService):
"""
    TTS (Text-to-Speech) service - supports local Edge TTS and ComfyUI workflows
    Uses Edge TTS for local inference and ComfyKit to execute TTS workflows.
Usage:
# Use default workflow
audio_path = await pixelle_video.tts(text="Hello, world!")
# Use specific workflow
audio_path = await pixelle_video.tts(
text="你好,世界!",
workflow="tts_edge.json"
)
# List available workflows
workflows = pixelle_video.tts.list_workflows()
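    # Force local Edge TTS (no ComfyUI required)
    audio_path = await pixelle_video.tts(text="Hello!", inference_mode="local")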
"""
WORKFLOW_PREFIX = "tts_"
DEFAULT_WORKFLOW = None # No hardcoded default, must be configured
WORKFLOWS_DIR = "workflows"
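    # Workflow files are resolved via ComfyBaseService._resolve_workflow(); they are
    # expected to live under WORKFLOWS_DIR and follow the "tts_*" naming implied by
    # WORKFLOW_PREFIX (e.g. "workflows/tts_edge.json").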
def __init__(self, config: dict, core=None):
"""
Initialize TTS service
Args:
config: Full application config dict
core: PixelleVideoCore instance (for accessing shared ComfyKit)
"""
super().__init__(config, service_name="tts", core=core)
async def __call__(
self,
text: str,
workflow: Optional[str] = None,
# ComfyUI connection (optional overrides)
comfyui_url: Optional[str] = None,
runninghub_api_key: Optional[str] = None,
# TTS parameters
voice: Optional[str] = None,
speed: Optional[float] = None,
# Inference mode override
inference_mode: Optional[str] = None,
# Output path
output_path: Optional[str] = None,
**params
) -> str:
"""
Generate speech using local Edge TTS or ComfyUI workflow
Args:
text: Text to convert to speech
workflow: Workflow filename (for ComfyUI mode, default: from config)
comfyui_url: ComfyUI URL (optional, overrides config)
runninghub_api_key: RunningHub API key (optional, overrides config)
voice: Voice ID (for local mode: Edge TTS voice ID; for ComfyUI: workflow-specific)
speed: Speech speed multiplier (1.0 = normal, >1.0 = faster, <1.0 = slower)
inference_mode: Override inference mode ("local" or "comfyui", default: from config)
output_path: Custom output path (auto-generated if None)
**params: Additional workflow parameters
Returns:
Generated audio file path
Examples:
# Local inference (Edge TTS)
audio_path = await pixelle_video.tts(
text="Hello, world!",
inference_mode="local",
voice="zh-CN-YunjianNeural",
speed=1.2
)
# ComfyUI inference
audio_path = await pixelle_video.tts(
text="你好,世界!",
inference_mode="comfyui",
workflow="runninghub/tts_edge.json"
)
"""
# Determine inference mode (param > config)
mode = inference_mode or self.config.get("inference_mode", "local")
# Route to appropriate implementation
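        # (any value other than "local" falls through to the ComfyUI path)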
if mode == "local":
return await self._call_local_tts(
text=text,
voice=voice,
speed=speed,
output_path=output_path
)
else: # comfyui
# 1. Resolve workflow (returns structured info)
workflow_info = self._resolve_workflow(workflow=workflow)
# 2. Execute ComfyUI workflow
return await self._call_comfyui_workflow(
workflow_info=workflow_info,
text=text,
comfyui_url=comfyui_url,
runninghub_api_key=runninghub_api_key,
voice=voice,
speed=speed,
output_path=output_path,
**params
)
async def _call_local_tts(
self,
text: str,
voice: Optional[str] = None,
speed: Optional[float] = None,
output_path: Optional[str] = None,
) -> str:
"""
Generate speech using local Edge TTS
Args:
text: Text to convert to speech
voice: Edge TTS voice ID (default: from config)
speed: Speech speed multiplier (default: from config)
output_path: Custom output path (auto-generated if None)
Returns:
Generated audio file path
"""
# Get config defaults
local_config = self.config.get("local", {})
# Determine voice and speed (param > config)
final_voice = voice or local_config.get("voice", "zh-CN-YunjianNeural")
final_speed = speed if speed is not None else local_config.get("speed", 1.2)
# Convert speed to rate parameter
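        # Edge TTS expresses rate as a signed percentage string (e.g. "+20%");
        # speed_to_rate is assumed to convert the speed multiplier into that form.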
rate = speed_to_rate(final_speed)
logger.info(f"🎙️ Using local Edge TTS: voice={final_voice}, speed={final_speed}x (rate={rate})")
        # Generate a unique output path if not provided
        if not output_path:
            unique_id = uuid.uuid4().hex
            output_path = f"output/{unique_id}.mp3"
        # Ensure the output directory exists before Edge TTS writes the file
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        # Call Edge TTS (writes the audio file to output_path)
        try:
            await edge_tts(
                text=text,
                voice=final_voice,
                rate=rate,
                output_path=output_path
            )
logger.info(f"✅ Generated audio (local Edge TTS): {output_path}")
return output_path
except Exception as e:
logger.error(f"Local TTS generation error: {e}")
raise
async def _call_comfyui_workflow(
self,
workflow_info: dict,
text: str,
comfyui_url: Optional[str] = None,
runninghub_api_key: Optional[str] = None,
voice: Optional[str] = None,
        speed: Optional[float] = None,
output_path: Optional[str] = None,
**params
) -> str:
"""
Generate speech using ComfyUI workflow
Args:
workflow_info: Workflow info dict from _resolve_workflow()
text: Text to convert to speech
comfyui_url: ComfyUI URL
runninghub_api_key: RunningHub API key
voice: Voice ID (workflow-specific)
speed: Speech speed multiplier (workflow-specific)
output_path: Custom output path (downloads if URL returned)
**params: Additional workflow parameters
Returns:
            Generated audio file path (downloaded to output_path when the workflow returns a URL and output_path is set; otherwise the path or URL returned by the workflow)
"""
logger.info(f"🎙️ Using workflow: {workflow_info['key']}")
# 1. Build workflow parameters (ComfyKit config is now managed by core)
workflow_params = {"text": text}
# Add optional TTS parameters (only if explicitly provided and not None)
if voice is not None:
workflow_params["voice"] = voice
if speed is not None and speed != 1.0:
workflow_params["speed"] = speed
# Add any additional parameters
workflow_params.update(params)
logger.debug(f"Workflow parameters: {workflow_params}")
        # 2. Execute workflow using shared ComfyKit instance from core
try:
# Get shared ComfyKit instance (lazy initialization + config hot-reload)
kit = await self.core._get_or_create_comfykit()
# Determine what to pass to ComfyKit based on source
if workflow_info["source"] == "runninghub" and "workflow_id" in workflow_info:
# RunningHub: pass workflow_id
workflow_input = workflow_info["workflow_id"]
logger.info(f"Executing RunningHub TTS workflow: {workflow_input}")
else:
# Selfhost: pass file path
workflow_input = workflow_info["path"]
logger.info(f"Executing selfhost TTS workflow: {workflow_input}")
result = await kit.execute(workflow_input, workflow_params)
            # 3. Handle result
if result.status != "completed":
error_msg = result.msg or "Unknown error"
logger.error(f"TTS generation failed: {error_msg}")
raise Exception(f"TTS generation failed: {error_msg}")
# ComfyKit result can have audio files in different output types
# Try to get audio file path from result
audio_path = None
# Check for audio files in result.audios (if available)
if hasattr(result, 'audios') and result.audios:
audio_path = result.audios[0]
logger.debug(f"✅ Found audio in result.audios: {audio_path}")
# Check for files in result.files
elif hasattr(result, 'files') and result.files:
audio_path = result.files[0]
logger.debug(f"✅ Found audio in result.files: {audio_path}")
# Check in outputs dictionary
elif hasattr(result, 'outputs') and result.outputs:
logger.debug(f"Searching for audio file in result.outputs: {result.outputs}")
# Try to find audio file in outputs
for key, value in result.outputs.items():
if isinstance(value, str) and any(value.endswith(ext) for ext in ['.mp3', '.wav', '.flac']):
audio_path = value
logger.debug(f"✅ Found audio in result.outputs[{key}]: {audio_path}")
break
if not audio_path:
logger.error("No audio file generated")
logger.error(f"❌ Result analysis:")
logger.error(f" - result.audios: {getattr(result, 'audios', 'NOT_FOUND')}")
logger.error(f" - result.files: {getattr(result, 'files', 'NOT_FOUND')}")
logger.error(f" - result.outputs: {getattr(result, 'outputs', 'NOT_FOUND')}")
logger.error(f" - Full __dict__: {result.__dict__}")
raise Exception("No audio file generated by workflow")
# If output_path provided and audio_path is URL, download to local
if output_path and audio_path.startswith(('http://', 'https://')):
                import httpx  # lazy import; only needed when downloading remote audio
                # Ensure the parent directory exists (skip when output_path has no directory part)
                parent_dir = os.path.dirname(output_path)
                if parent_dir:
                    os.makedirs(parent_dir, exist_ok=True)
logger.info(f"Downloading audio from {audio_path} to {output_path}")
async with httpx.AsyncClient() as client:
response = await client.get(audio_path)
response.raise_for_status()
with open(output_path, 'wb') as f:
f.write(response.content)
logger.info(f"✅ Generated audio (ComfyUI): {output_path}")
return output_path
logger.info(f"✅ Generated audio (ComfyUI): {audio_path}")
return audio_path
except Exception as e:
logger.error(f"TTS generation error: {e}")
raise