343 lines
14 KiB
Python
343 lines
14 KiB
Python
# Copyright (C) 2025 AIDC-AI
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""
|
|
Edge TTS Utility - Temporarily not used
|
|
|
|
This is the original edge-tts implementation, kept here for potential future use.
|
|
Currently, TTS service uses ComfyUI workflows only.
|
|
"""
|
|
|
|
import asyncio
|
|
import ssl
|
|
import random
|
|
import edge_tts as edge_tts_sdk
|
|
from loguru import logger
|
|
from aiohttp import WSServerHandshakeError, ClientResponseError
|
|
|
|
|
|
# TLS certificate verification switch. When False, the request helpers below
# temporarily make ssl.create_default_context() return non-verifying contexts.
# False is intended for development only.
# NOTE(review): the committed value is False - confirm before production use.
_SSL_VERIFY_ENABLED: bool = False

# Retry policy (Edge TTS intermittently answers with 401).
_RETRY_COUNT: int = 5  # retries performed after the first attempt (raised from 3)
_RETRY_BASE_DELAY: float = 1.0  # seconds; first step of the exponential backoff
_MAX_RETRY_DELAY: float = 10.0  # seconds; ceiling for any single backoff pause

# Throttling.
_REQUEST_DELAY: float = 0.5  # seconds; minimum pause before every request
_MAX_CONCURRENT_REQUESTS: int = 3  # requests allowed in flight at the same time

# Module-wide semaphore that enforces _MAX_CONCURRENT_REQUESTS.
_request_semaphore: asyncio.Semaphore = asyncio.Semaphore(_MAX_CONCURRENT_REQUESTS)
|
|
|
|
|
|
def _retry_delay(attempt: int, base_delay: float) -> float:
    """Return the pause, in seconds, to take before retry number ``attempt``.

    ``attempt`` is 1 for the first retry. The pause is
    ``base_delay * 2 ** (attempt - 1)`` plus a random jitter drawn from
    ``[0, base_delay]``, capped at ``_MAX_RETRY_DELAY``.
    """
    exponential_delay = base_delay * (2 ** (attempt - 1))
    jitter = random.uniform(0, base_delay)
    return min(exponential_delay + jitter, _MAX_RETRY_DELAY)


class _UnverifiedSSLPatch:
    """Reference-counted, temporary override of ``ssl.create_default_context``.

    While at least one active instance is entered, ``ssl.create_default_context``
    returns contexts with hostname checking and certificate verification
    turned off. The first user captures the genuine factory and the last user
    puts it back.

    A plain save/patch/restore per call is unsafe here because
    ``_request_semaphore`` lets several calls overlap: a second call would
    save the first call's *patched* factory as its "original" and reinstall
    it on exit, leaving verification disabled for the rest of the process.

    The instance is a no-op when ``_SSL_VERIFY_ENABLED`` is true at
    construction time. The counter is unsynchronised class state; it assumes
    every user runs on the same event-loop thread.
    """

    _depth = 0
    _original_factory = None

    def __init__(self):
        # Snapshot the flag so __exit__ always mirrors what __enter__ did,
        # even if the module flag is flipped while a request is in flight.
        self._active = not _SSL_VERIFY_ENABLED

    def __enter__(self):
        if not self._active:
            return self

        cls = _UnverifiedSSLPatch
        if cls._depth == 0:
            original = ssl.create_default_context

            def create_unverified_context(*args, **kwargs):
                ctx = original(*args, **kwargs)
                # check_hostname must be cleared before verify_mode can be
                # set to CERT_NONE.
                ctx.check_hostname = False
                ctx.verify_mode = ssl.CERT_NONE
                return ctx

            cls._original_factory = original
            ssl.create_default_context = create_unverified_context
        cls._depth += 1
        return self

    def __exit__(self, exc_type, exc, tb):
        if not self._active:
            return False

        cls = _UnverifiedSSLPatch
        cls._depth -= 1
        if cls._depth == 0:
            ssl.create_default_context = cls._original_factory
            cls._original_factory = None
        return False  # never swallow exceptions


async def _synthesize_once(text: str, voice: str, rate: str, volume: str, pitch: str) -> bytes:
    """Run a single Edge TTS request and return the concatenated audio chunks."""
    communicate = edge_tts_sdk.Communicate(
        text=text,
        voice=voice,
        rate=rate,
        volume=volume,
        pitch=pitch,
    )

    # The stream yields dict chunks of several types; only "audio" ones
    # carry payload bytes.
    audio_chunks = []
    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            audio_chunks.append(chunk["data"])

    return b"".join(audio_chunks)


async def edge_tts(
    text: str,
    voice: str = "[Chinese] zh-CN Yunjian",
    rate: str = "+0%",
    volume: str = "+0%",
    pitch: str = "+0Hz",
    output_path: str = None,
    retry_count: int = _RETRY_COUNT,
    retry_base_delay: float = _RETRY_BASE_DELAY,
) -> bytes:
    """
    Convert text to speech using Microsoft Edge TTS.

    The call waits for a slot on ``_request_semaphore``, sleeps
    ``_REQUEST_DELAY`` plus up to 0.3s of random jitter, then tries the
    request up to ``retry_count + 1`` times. ``WSServerHandshakeError`` and
    ``ClientResponseError`` are retried with exponential backoff (see
    ``_retry_delay``); any other exception is logged and re-raised at once.
    The semaphore slot is held for the whole call, including backoff pauses.

    When ``_SSL_VERIFY_ENABLED`` is false, certificate verification is
    disabled for the duration of each attempt via ``_UnverifiedSSLPatch``.

    Args:
        text: Text to convert to speech
        voice: Voice ID, passed to the SDK unchanged
            (e.g., [Chinese] zh-CN Yunjian, [English] en-US Jenny).
            NOTE(review): confirm the installed edge-tts accepts this
            bracketed label format.
        rate: Speech rate (e.g., +0%, +50%, -20%)
        volume: Speech volume (e.g., +0%, +50%, -20%)
        pitch: Speech pitch (e.g., +0Hz, +10Hz, -5Hz)
        output_path: Optional file path; when given, the audio bytes are
            also written there
        retry_count: Number of retries after the first attempt (default: 5)
        retry_base_delay: Base delay for exponential backoff (default: 1.0s)

    Returns:
        The audio bytes streamed by the SDK (documented by the original
        author as MP3).

    Raises:
        WSServerHandshakeError, ClientResponseError: when the final attempt
            still fails with one of these.
        RuntimeError: when ``retry_count`` is negative, so no attempt is made.
        Exception: any other error from the request or the file write is
            re-raised immediately without retrying.

    Voices listed by the original author:
        - [Chinese] zh-CN Yunjian (male, default)
        - [Chinese] zh-CN Xiaoxiao (female)
        - [Chinese] zh-CN Yunxi (male)
        - [Chinese] zh-CN Xiaoyi (female)
        - [English] en-US Jenny (female)
        - [English] en-US Guy (male)
        - [English] en-GB Sonia (female, British)

    Example:
        audio_bytes = await edge_tts(
            text="Hello, world!",
            voice="[English] en-US Jenny",
            rate="+20%"
        )
    """
    logger.debug(f"Calling Edge TTS with voice: {voice}, rate: {rate}, retry_count: {retry_count}")

    # Use semaphore to limit concurrent requests
    async with _request_semaphore:
        # Add a small random delay before each request to avoid rate limiting
        pre_delay = _REQUEST_DELAY + random.uniform(0, 0.3)
        logger.debug(f"Waiting {pre_delay:.2f}s before request (rate limiting)")
        await asyncio.sleep(pre_delay)

        for attempt in range(retry_count + 1):  # +1 because first attempt is not a retry
            if attempt > 0:
                retry_delay = _retry_delay(attempt, retry_base_delay)
                logger.info(f"🔄 Retrying Edge TTS (attempt {attempt + 1}/{retry_count + 1}) after {retry_delay:.2f}s delay...")
                await asyncio.sleep(retry_delay)
            elif not _SSL_VERIFY_ENABLED:
                # Only log the warning once per call
                logger.warning("SSL verification is disabled for development. This is NOT recommended for production!")

            with _UnverifiedSSLPatch():
                try:
                    audio_data = await _synthesize_once(text, voice, rate, volume, pitch)

                    if attempt > 0:
                        logger.success(f"✅ Retry succeeded on attempt {attempt + 1}")

                    logger.info(f"Generated {len(audio_data)} bytes of audio data")

                    # Save to file if output_path is provided
                    if output_path:
                        with open(output_path, "wb") as f:
                            f.write(audio_data)
                        logger.info(f"Audio saved to: {output_path}")

                    return audio_data

                except (WSServerHandshakeError, ClientResponseError) as e:
                    # Network/authentication errors - retry
                    error_code = getattr(e, 'status', 'unknown')
                    error_msg = str(e)

                    # Log more detailed information for 401 errors
                    if error_code == 401 or '401' in error_msg:
                        logger.warning(f"⚠️ Edge TTS 401 Authentication Error (attempt {attempt + 1}/{retry_count + 1})")
                        logger.debug(f"Error details: {error_msg}")
                        logger.debug("This is usually caused by rate limiting. Will retry with exponential backoff...")
                    else:
                        logger.warning(f"⚠️ Edge TTS error (attempt {attempt + 1}/{retry_count + 1}): {error_code} - {e}")

                    if attempt >= retry_count:
                        # Last attempt failed
                        logger.error(f"❌ All {retry_count + 1} attempts failed. Last error: {error_code}")
                        raise
                    # Otherwise, fall through to the next retry

                except Exception as e:
                    # Other errors - don't retry, raise immediately
                    logger.error(f"Edge TTS error (non-retryable): {type(e).__name__} - {e}")
                    raise

        # Every loop iteration either returns, raises, or continues, and the
        # final iteration cannot continue. This point is therefore reached
        # only when retry_count is negative and the loop body never ran.
        raise RuntimeError("Edge TTS failed without error (unexpected)")


def get_audio_duration(audio_path: str) -> float:
    """
    Get audio file duration in seconds.

    First asks ``ffmpeg.probe`` (ffmpeg-python, imported lazily) for the
    container duration. If that fails for any reason, including the package
    not being importable, a warning is logged and the duration is estimated
    from the file size instead.

    Args:
        audio_path: Path to audio file

    Returns:
        Duration in seconds. The size-based estimate is never below 1.0.

    Raises:
        OSError: from ``os.path.getsize`` when the fallback cannot stat
            ``audio_path``.
    """
    try:
        # Try using ffmpeg-python
        import ffmpeg

        probe = ffmpeg.probe(audio_path)
        return float(probe['format']['duration'])
    except Exception as e:
        logger.warning(f"Failed to get audio duration: {e}, using estimate")

        # Fallback: estimate based on file size (very rough)
        import os

        file_size = os.path.getsize(audio_path)
        # Assume ~16kbps for MP3, i.e. 2000 bytes per second
        estimated_duration = file_size / 2000
        return max(1.0, estimated_duration)  # At least 1 second


async def _fetch_voice_ids(locale: str = None) -> list[str]:
    """Fetch the SDK voice list once and return the ``ShortName`` of each entry.

    When ``locale`` is truthy, only voices whose ``Locale`` starts with it
    are kept.
    """
    voices = await edge_tts_sdk.list_voices()

    if locale:
        voices = [v for v in voices if v["Locale"].startswith(locale)]

    return [voice["ShortName"] for voice in voices]


async def list_voices(locale: str = None, retry_count: int = _RETRY_COUNT, retry_base_delay: float = _RETRY_BASE_DELAY) -> list[str]:
    """
    List all available voices for Edge TTS.

    Returns the ``ShortName`` of every voice reported by the SDK, optionally
    restricted to locales starting with ``locale``.

    Uses the same throttling and retry scheme as ``edge_tts``: a slot on
    ``_request_semaphore``, a short randomised pre-request pause, and up to
    ``retry_count + 1`` attempts with exponential backoff for
    ``WSServerHandshakeError`` / ``ClientResponseError``. Other exceptions
    are logged and re-raised immediately.

    Args:
        locale: Locale prefix to filter by (e.g., zh-CN, en-US, ja-JP)
        retry_count: Number of retries after the first attempt (default: 5)
        retry_base_delay: Base delay for exponential backoff (default: 1.0s)

    Returns:
        List of voice IDs (``ShortName`` values)

    Raises:
        WSServerHandshakeError, ClientResponseError: when the final attempt
            still fails with one of these.
        RuntimeError: when ``retry_count`` is negative, so no attempt is made.

    Example:
        # List all voices
        voices = await list_voices()

        # List Chinese voices only
        voices = await list_voices(locale="zh-CN")
    """
    logger.debug(f"Fetching Edge TTS voices, locale filter: {locale}, retry_count: {retry_count}")

    # Use semaphore to limit concurrent requests
    async with _request_semaphore:
        # Add a small random delay before each request to avoid rate limiting
        pre_delay = _REQUEST_DELAY + random.uniform(0, 0.3)
        logger.debug(f"Waiting {pre_delay:.2f}s before request (rate limiting)")
        await asyncio.sleep(pre_delay)

        for attempt in range(retry_count + 1):
            if attempt > 0:
                retry_delay = _retry_delay(attempt, retry_base_delay)
                logger.info(f"🔄 Retrying list voices (attempt {attempt + 1}/{retry_count + 1}) after {retry_delay:.2f}s delay...")
                await asyncio.sleep(retry_delay)
            elif not _SSL_VERIFY_ENABLED:
                # Only log the warning once per call
                logger.warning("SSL verification is disabled for development. This is NOT recommended for production!")

            with _UnverifiedSSLPatch():
                try:
                    voice_ids = await _fetch_voice_ids(locale)

                    if attempt > 0:
                        logger.success(f"✅ Retry succeeded on attempt {attempt + 1}")

                    logger.info(f"Found {len(voice_ids)} voices" + (f" for locale '{locale}'" if locale else ""))
                    return voice_ids

                except (WSServerHandshakeError, ClientResponseError) as e:
                    # Network/authentication errors - retry
                    error_code = getattr(e, 'status', 'unknown')
                    error_msg = str(e)

                    # Log more detailed information for 401 errors
                    if error_code == 401 or '401' in error_msg:
                        logger.warning(f"⚠️ Edge TTS 401 Authentication Error (list_voices attempt {attempt + 1}/{retry_count + 1})")
                        logger.debug(f"Error details: {error_msg}")
                        logger.debug("This is usually caused by rate limiting. Will retry with exponential backoff...")
                    else:
                        logger.warning(f"⚠️ List voices error (attempt {attempt + 1}/{retry_count + 1}): {error_code} - {e}")

                    if attempt >= retry_count:
                        logger.error(f"❌ All {retry_count + 1} attempts failed. Last error: {error_code}")
                        raise
                    # Otherwise, fall through to the next retry

                except Exception as e:
                    # Other errors - don't retry, raise immediately
                    logger.error(f"List voices error (non-retryable): {type(e).__name__} - {e}")
                    raise

        # Reached only when retry_count is negative and the loop body never
        # ran; a failing final attempt always re-raises inside the loop.
        raise RuntimeError("List voices failed without error (unexpected)")
|
|
|