83 lines
2.4 KiB
Python
83 lines
2.4 KiB
Python
"""
|
|
TTS (Text-to-Speech) endpoints
|
|
"""
|
|
|
|
from fastapi import APIRouter, HTTPException
|
|
from loguru import logger
|
|
|
|
from api.dependencies import PixelleVideoDep
|
|
from api.schemas.tts import TTSSynthesizeRequest, TTSSynthesizeResponse
|
|
from pixelle_video.utils.tts_util import get_audio_duration
|
|
|
|
# Router for the TTS endpoints: every route registered on it is served under
# the "/tts" path prefix and carries the "Basic Services" tag.
router = APIRouter(prefix="/tts", tags=["Basic Services"])
|
|
|
|
|
|
@router.post("/synthesize", response_model=TTSSynthesizeResponse)
|
|
async def tts_synthesize(
|
|
request: TTSSynthesizeRequest,
|
|
pixelle_video: PixelleVideoDep
|
|
):
|
|
"""
|
|
Text-to-Speech synthesis endpoint
|
|
|
|
Convert text to speech audio using ComfyUI workflows.
|
|
|
|
- **text**: Text to synthesize
|
|
- **workflow**: TTS workflow key (optional, uses default if not specified)
|
|
- **ref_audio**: Reference audio for voice cloning (optional)
|
|
- **voice_id**: (Deprecated) Voice ID for legacy compatibility
|
|
|
|
Returns path to generated audio file and duration.
|
|
|
|
Examples:
|
|
```json
|
|
{
|
|
"text": "Hello, welcome to Pixelle-Video!",
|
|
"workflow": "runninghub/tts_edge.json"
|
|
}
|
|
```
|
|
|
|
With voice cloning:
|
|
```json
|
|
{
|
|
"text": "Hello, this is a cloned voice",
|
|
"workflow": "runninghub/tts_index2.json",
|
|
"ref_audio": "path/to/reference.wav"
|
|
}
|
|
```
|
|
"""
|
|
try:
|
|
logger.info(f"TTS synthesis request: {request.text[:50]}...")
|
|
|
|
# Build TTS parameters
|
|
tts_params = {"text": request.text}
|
|
|
|
# Add workflow if specified
|
|
if request.workflow:
|
|
tts_params["workflow"] = request.workflow
|
|
|
|
# Add ref_audio if specified
|
|
if request.ref_audio:
|
|
tts_params["ref_audio"] = request.ref_audio
|
|
|
|
# Legacy voice_id support (deprecated)
|
|
if request.voice_id and not request.workflow:
|
|
logger.warning("voice_id parameter is deprecated, please use workflow instead")
|
|
tts_params["voice"] = request.voice_id
|
|
|
|
# Call TTS service
|
|
audio_path = await pixelle_video.tts(**tts_params)
|
|
|
|
# Get audio duration
|
|
duration = get_audio_duration(audio_path)
|
|
|
|
return TTSSynthesizeResponse(
|
|
audio_path=audio_path,
|
|
duration=duration
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"TTS synthesis error: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|