# AI-Video/reelforge/utils/tts_util.py

"""
Edge TTS Utility - Temporarily not used

This is the original edge-tts implementation, kept here for potential future use.
Currently, TTS service uses ComfyUI workflows only.
"""

import asyncio
import ssl
import edge_tts as edge_tts_sdk
from loguru import logger
from aiohttp import WSServerHandshakeError, ClientResponseError


# Global flag for SSL verification (set to False for development only).
# While this is False, edge_tts() and list_voices() temporarily monkey-patch
# ssl.create_default_context so that every context it creates has
# check_hostname disabled and verify_mode set to ssl.CERT_NONE.
# NOTE(review): the default is the insecure setting - switch to True before
# using this module outside of development.
_SSL_VERIFY_ENABLED = False

# Retry configuration for Edge TTS (to handle 401 errors)
_RETRY_COUNT = 3       # Default number of retries after the first attempt
_RETRY_DELAY = 2.0     # Delay before each retry, in seconds


async def edge_tts(
    text: str,
    voice: str = "zh-CN-YunjianNeural",
    rate: str = "+0%",
    volume: str = "+0%",
    pitch: str = "+0Hz",
    output_path: str = None,
    retry_count: int = _RETRY_COUNT,
    retry_delay: float = _RETRY_DELAY,
) -> bytes:
    """
    Convert text to speech using Microsoft Edge TTS

    This service is free and requires no API key.
    Supports 400+ voices across 100+ languages.

    Returns audio data as bytes (MP3 format).

    Includes automatic retry mechanism to handle 401 authentication errors
    and temporary network issues (default: 3 retries with 2s delay).
    Only WSServerHandshakeError and ClientResponseError are retried; any
    other exception is logged and re-raised immediately.

    While the module flag _SSL_VERIFY_ENABLED is False, each attempt
    temporarily replaces ssl.create_default_context with a wrapper that
    disables hostname checking and certificate verification, and restores the
    real factory when the attempt ends (successfully or not).

    Args:
        text: Text to convert to speech
        voice: Voice ID (e.g., zh-CN-YunjianNeural, en-US-JennyNeural)
        rate: Speech rate (e.g., +0%, +50%, -20%)
        volume: Speech volume (e.g., +0%, +50%, -20%)
        pitch: Speech pitch (e.g., +0Hz, +10Hz, -5Hz)
        output_path: Optional output file path to save audio
        retry_count: Number of retries on failure (default: 3). Negative
            values are treated as 0, i.e. a single attempt with no retries.
        retry_delay: Delay between retries in seconds (default: 2.0)

    Returns:
        Audio data as bytes (MP3 format)

    Raises:
        WSServerHandshakeError, ClientResponseError: If the last allowed
            attempt still fails with one of these retryable errors.
        Exception: Any other error is re-raised without retrying.

    Popular Chinese voices:
    - zh-CN-YunjianNeural (male, default)
    - zh-CN-XiaoxiaoNeural (female)
    - zh-CN-YunxiNeural (male)
    - zh-CN-XiaoyiNeural (female)

    Popular English voices:
    - en-US-JennyNeural (female)
    - en-US-GuyNeural (male)
    - en-GB-SoniaNeural (female, British)

    Example:
        audio_bytes = await edge_tts(
            text="你好，世界！",
            voice="zh-CN-YunjianNeural",
            rate="+20%"
        )
    """
    logger.debug(f"Calling Edge TTS with voice: {voice}, rate: {rate}, retry_count: {retry_count}")

    # A negative retry_count would skip the loop entirely and surface as a
    # misleading "failed without error"; always make at least one attempt.
    retry_count = max(0, retry_count)

    last_error = None

    # Retry loop
    for attempt in range(retry_count + 1):  # +1 because first attempt is not a retry
        try:
            if attempt > 0:
                logger.info(f"🔄 Retrying Edge TTS (attempt {attempt + 1}/{retry_count + 1}) after {retry_delay}s delay...")
                await asyncio.sleep(retry_delay)

            # Decide once per attempt whether to patch, so the restore in the
            # `finally` below stays paired with the patch even if the module
            # flag is changed while this attempt is awaiting.
            ssl_patched = not _SSL_VERIFY_ENABLED

            # Monkey patch ssl.create_default_context if SSL verification is disabled
            if ssl_patched:
                if attempt == 0:  # Only log warning once per call
                    logger.warning("SSL verification is disabled for development. This is NOT recommended for production!")

                # Another in-flight call may already have installed its own
                # wrapper. Unwrap it to the real factory: capturing the
                # wrapper as "original" would make our restore re-install it
                # after its owner finished, leaving verification disabled for
                # the whole process.
                installed_factory = ssl.create_default_context
                original_create_default_context = getattr(
                    installed_factory, "_tts_original_factory", installed_factory
                )

                def create_unverified_context(*args, **kwargs):
                    ctx = original_create_default_context(*args, **kwargs)
                    ctx.check_hostname = False
                    ctx.verify_mode = ssl.CERT_NONE
                    return ctx

                # Tag the wrapper so overlapping calls can find the real factory.
                create_unverified_context._tts_original_factory = original_create_default_context

                # Temporarily replace the function
                ssl.create_default_context = create_unverified_context

            try:
                # Create communicate instance
                communicate = edge_tts_sdk.Communicate(
                    text=text,
                    voice=voice,
                    rate=rate,
                    volume=volume,
                    pitch=pitch,
                )

                # Collect audio chunks (non-audio chunk types are ignored)
                audio_chunks = []
                async for chunk in communicate.stream():
                    if chunk["type"] == "audio":
                        audio_chunks.append(chunk["data"])

                audio_data = b"".join(audio_chunks)

                if attempt > 0:
                    logger.success(f"✅ Retry succeeded on attempt {attempt + 1}")

                logger.info(f"Generated {len(audio_data)} bytes of audio data")

                # Save to file if output_path is provided
                if output_path:
                    with open(output_path, "wb") as f:
                        f.write(audio_data)
                    logger.info(f"Audio saved to: {output_path}")

                return audio_data

            finally:
                # Always restore the real factory (never another call's
                # wrapper), so the patch cannot outlive the calls using it.
                if ssl_patched:
                    ssl.create_default_context = original_create_default_context

        except (WSServerHandshakeError, ClientResponseError) as e:
            # Network/authentication errors - retry
            last_error = e
            error_code = getattr(e, 'status', 'unknown')
            logger.warning(f"⚠️  Edge TTS error (attempt {attempt + 1}/{retry_count + 1}): {error_code} - {e}")

            if attempt >= retry_count:
                # Last attempt failed
                logger.error(f"❌ All {retry_count + 1} attempts failed. Giving up.")
                raise
            # Otherwise, continue to next retry

        except Exception as e:
            # Other errors - don't retry, raise immediately
            logger.error(f"Edge TTS error (non-retryable): {e}")
            raise

    # Should not reach here (every attempt returns or raises), but just in case
    if last_error:
        raise last_error
    else:
        raise RuntimeError("Edge TTS failed without error (unexpected)")


async def list_voices(locale: str = None, retry_count: int = _RETRY_COUNT, retry_delay: float = _RETRY_DELAY) -> list[str]:
    """
    List all available voices for Edge TTS

    Returns a list of voice IDs (ShortName).
    Optionally filter by locale.

    Includes automatic retry mechanism to handle network errors
    (default: 3 retries with 2s delay). Only WSServerHandshakeError and
    ClientResponseError are retried; any other exception is logged and
    re-raised immediately.

    While the module flag _SSL_VERIFY_ENABLED is False, each attempt
    temporarily replaces ssl.create_default_context with a wrapper that
    disables hostname checking and certificate verification, and restores the
    real factory when the attempt ends (successfully or not).

    Args:
        locale: Filter by locale prefix (e.g., zh-CN, en-US, ja-JP); voices
            whose "Locale" starts with this value are kept
        retry_count: Number of retries on failure (default: 3). Negative
            values are treated as 0, i.e. a single attempt with no retries.
        retry_delay: Delay between retries in seconds (default: 2.0)

    Returns:
        List of voice IDs

    Raises:
        WSServerHandshakeError, ClientResponseError: If the last allowed
            attempt still fails with one of these retryable errors.
        Exception: Any other error is re-raised without retrying.

    Example:
        # List all voices
        voices = await list_voices()
        # Returns: ['zh-CN-YunjianNeural', 'zh-CN-XiaoxiaoNeural', ...]

        # List Chinese voices only
        voices = await list_voices(locale="zh-CN")
        # Returns: ['zh-CN-YunjianNeural', 'zh-CN-XiaoxiaoNeural', ...]
    """
    logger.debug(f"Fetching Edge TTS voices, locale filter: {locale}, retry_count: {retry_count}")

    # A negative retry_count would skip the loop entirely and surface as a
    # misleading "failed without error"; always make at least one attempt.
    retry_count = max(0, retry_count)

    last_error = None

    # Retry loop
    for attempt in range(retry_count + 1):
        try:
            if attempt > 0:
                logger.info(f"🔄 Retrying list voices (attempt {attempt + 1}/{retry_count + 1}) after {retry_delay}s delay...")
                await asyncio.sleep(retry_delay)

            # Decide once per attempt whether to patch, so the restore in the
            # `finally` below stays paired with the patch even if the module
            # flag is changed while this attempt is awaiting.
            ssl_patched = not _SSL_VERIFY_ENABLED

            # Monkey patch SSL if verification is disabled
            if ssl_patched:
                if attempt == 0:  # Only log warning once per call
                    logger.warning("SSL verification is disabled for development. This is NOT recommended for production!")

                # Another in-flight call may already have installed its own
                # wrapper. Unwrap it to the real factory: capturing the
                # wrapper as "original" would make our restore re-install it
                # after its owner finished, leaving verification disabled for
                # the whole process.
                installed_factory = ssl.create_default_context
                original_create_default_context = getattr(
                    installed_factory, "_tts_original_factory", installed_factory
                )

                def create_unverified_context(*args, **kwargs):
                    ctx = original_create_default_context(*args, **kwargs)
                    ctx.check_hostname = False
                    ctx.verify_mode = ssl.CERT_NONE
                    return ctx

                # Tag the wrapper so overlapping calls can find the real factory.
                create_unverified_context._tts_original_factory = original_create_default_context

                ssl.create_default_context = create_unverified_context

            try:
                # Get all voices
                voices = await edge_tts_sdk.list_voices()

                # Filter by locale if specified
                if locale:
                    voices = [v for v in voices if v["Locale"].startswith(locale)]

                # Extract voice IDs (ShortName)
                voice_ids = [voice["ShortName"] for voice in voices]

                if attempt > 0:
                    logger.success(f"✅ Retry succeeded on attempt {attempt + 1}")

                logger.info(f"Found {len(voice_ids)} voices" + (f" for locale '{locale}'" if locale else ""))
                return voice_ids

            finally:
                # Always restore the real factory (never another call's
                # wrapper), so the patch cannot outlive the calls using it.
                if ssl_patched:
                    ssl.create_default_context = original_create_default_context

        except (WSServerHandshakeError, ClientResponseError) as e:
            # Network/authentication errors - retry
            last_error = e
            error_code = getattr(e, 'status', 'unknown')
            logger.warning(f"⚠️  List voices error (attempt {attempt + 1}/{retry_count + 1}): {error_code} - {e}")

            if attempt >= retry_count:
                # Last attempt failed
                logger.error(f"❌ All {retry_count + 1} attempts failed. Giving up.")
                raise
            # Otherwise, continue to next retry

        except Exception as e:
            # Other errors - don't retry, raise immediately
            logger.error(f"List voices error (non-retryable): {e}")
            raise

    # Should not reach here (every attempt returns or raises), but just in case
    if last_error:
        raise last_error
    else:
        raise RuntimeError("List voices failed without error (unexpected)")