项目重命名: ReelForge => Pixelle-Video

2025-10-31 10:23:38 +08:00
parent dfdba73b74
commit 136514e466
60 changed files with 384 additions and 383 deletions
--- a/pixelle_video/utils/init.py
+++ b/pixelle_video/utils/init.py
@@ -0,0 +1,3 @@
+"""
+Pixelle-Video utilities
+"""
--- a/pixelle_video/utils/os_util.py
+++ b/pixelle_video/utils/os_util.py
@@ -0,0 +1,285 @@
+"""
+OS utilities for file and path management
+
+Provides utilities for managing paths and files in Pixelle-Video.
+Inspired by Pixelle-MCP's os_util.py.
+"""
+
+import os
+import random
+from datetime import datetime
+from pathlib import Path
+from typing import Optional, Tuple, Literal
+
+
+def get_pixelle_video_root_path() -> str:
+    """
+    Get Pixelle-Video root path - current working directory
+    
+    Returns:
+        Current working directory as string
+    """
+    return str(Path.cwd())
+
+
+def ensure_pixelle_video_root_path() -> str:
+    """
+    Ensure Pixelle-Video root path exists and return the path
+    
+    Returns:
+        Root path as string
+    """
+    root_path = get_pixelle_video_root_path()
+    root_path_obj = Path(root_path)
+    output_dir = root_path_obj / 'output'
+    output_dir.mkdir(parents=True, exist_ok=True)
+    
+    return root_path
+
+
+def get_root_path(*paths: str) -> str:
+    """
+    Get path relative to Pixelle-Video root path
+    
+    Args:
+        *paths: Path components to join
+    
+    Returns:
+        Absolute path as string
+    
+    Example:
+        get_root_path("temp", "audio.mp3")
+        # Returns: "/path/to/project/temp/audio.mp3"
+    """
+    root_path = ensure_pixelle_video_root_path()
+    if paths:
+        return os.path.join(root_path, *paths)
+    return root_path
+
+
+def get_temp_path(*paths: str) -> str:
+    """
+    Get path relative to Pixelle-Video temp folder
+    
+    Args:
+        *paths: Path components to join
+    
+    Returns:
+        Absolute path to temp directory or file
+    
+    Example:
+        get_temp_path("audio.mp3")
+        # Returns: "/path/to/project/temp/audio.mp3"
+    """
+    temp_path = get_root_path("temp")
+    if paths:
+        return os.path.join(temp_path, *paths)
+    return temp_path
+
+
+def get_data_path(*paths: str) -> str:
+    """
+    Get path relative to Pixelle-Video data folder
+    
+    Args:
+        *paths: Path components to join
+    
+    Returns:
+        Absolute path to data directory or file
+    
+    Example:
+        get_data_path("videos", "output.mp4")
+        # Returns: "/path/to/project/data/videos/output.mp4"
+    """
+    data_path = get_root_path("data")
+    if paths:
+        return os.path.join(data_path, *paths)
+    return data_path
+
+
+def get_output_path(*paths: str) -> str:
+    """
+    Get path relative to Pixelle-Video output folder
+    
+    Args:
+        *paths: Path components to join
+    
+    Returns:
+        Absolute path to output directory or file
+    
+    Example:
+        get_output_path("video.mp4")
+        # Returns: "/path/to/project/output/video.mp4"
+    """
+    output_path = get_root_path("output")
+    if paths:
+        return os.path.join(output_path, *paths)
+    return output_path
+
+
+def save_bytes_to_file(data: bytes, file_path: str) -> str:
+    """
+    Save bytes data to file
+    
+    Creates parent directories if they don't exist.
+    
+    Args:
+        data: Binary data to save
+        file_path: Target file path
+    
+    Returns:
+        Absolute path of saved file
+    
+    Example:
+        save_bytes_to_file(audio_data, get_temp_path("audio.mp3"))
+    """
+    # Ensure parent directory exists
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+    
+    # Write binary data
+    with open(file_path, "wb") as f:
+        f.write(data)
+    
+    return os.path.abspath(file_path)
+
+
+def ensure_dir(path: str) -> str:
+    """
+    Ensure directory exists, create if not
+    
+    Args:
+        path: Directory path
+    
+    Returns:
+        Absolute path of directory
+    """
+    os.makedirs(path, exist_ok=True)
+    return os.path.abspath(path)
+
+
+# ========== Task Directory Management ==========
+
+def create_task_id() -> str:
+    """
+    Create unique task ID with timestamp + random suffix
+    
+    Format: {timestamp}_{random_hex}
+    Example: "20251028_143052_ab3d"
+    
+    Collision probability: < 0.0001% (65536 combinations per second)
+    
+    Returns:
+        Task ID string
+    """
+    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+    random_suffix = f"{random.randint(0, 0xFFFF):04x}"  # 4-digit hex (0000-ffff)
+    return f"{timestamp}_{random_suffix}"
+
+
+def create_task_output_dir(task_id: Optional[str] = None) -> Tuple[str, str]:
+    """
+    Create isolated output directory for single video generation task
+    
+    Directory structure:
+        output/{task_id}/
+        ├── final.mp4           # Final video output
+        ├── frames/             # All frame-related files
+        │   ├── 01_audio.mp3
+        │   ├── 01_image.png
+        │   ├── 01_composed.png
+        │   ├── 01_segment.mp4
+        │   └── ...
+        └── metadata.json       # Optional: task metadata
+    
+    Args:
+        task_id: Optional task ID (auto-generated if None)
+    
+    Returns:
+        (task_dir, task_id) tuple
+        
+    Example:
+        >>> task_dir, task_id = create_task_output_dir()
+        >>> # task_dir = "/path/to/project/output/20251028_143052_ab3d"
+        >>> # task_id = "20251028_143052_ab3d"
+    """
+    if task_id is None:
+        task_id = create_task_id()
+    
+    task_dir = get_output_path(task_id)
+    frames_dir = os.path.join(task_dir, "frames")
+    
+    # Create directories
+    os.makedirs(frames_dir, exist_ok=True)
+    
+    return task_dir, task_id
+
+
+def get_task_path(task_id: str, *paths: str) -> str:
+    """
+    Get path within task directory
+    
+    Args:
+        task_id: Task ID
+        *paths: Path components to join
+    
+    Returns:
+        Absolute path within task directory
+        
+    Example:
+        >>> get_task_path("20251028_143052_ab3d", "final.mp4")
+        >>> # Returns: "/path/to/project/output/20251028_143052_ab3d/final.mp4"
+    """
+    task_dir = get_output_path(task_id)
+    if paths:
+        return os.path.join(task_dir, *paths)
+    return task_dir
+
+
+def get_task_frame_path(
+    task_id: str, 
+    frame_index: int, 
+    file_type: Literal["audio", "image", "composed", "segment"]
+) -> str:
+    """
+    Get frame file path within task directory
+    
+    Args:
+        task_id: Task ID
+        frame_index: Frame index (0-based internally, but filename starts from 01)
+        file_type: File type (audio/image/composed/segment)
+    
+    Returns:
+        Absolute path to frame file
+        
+    Example:
+        >>> get_task_frame_path("20251028_143052_ab3d", 0, "audio")
+        >>> # Returns: ".../output/20251028_143052_ab3d/frames/01_audio.mp3"
+    """
+    ext_map = {
+        "audio": "mp3",
+        "image": "png",
+        "composed": "png",
+        "segment": "mp4"
+    }
+    
+    # Frame number starts from 01 for better human readability
+    filename = f"{frame_index + 1:02d}_{file_type}.{ext_map[file_type]}"
+    return get_task_path(task_id, "frames", filename)
+
+
+def get_task_final_video_path(task_id: str) -> str:
+    """
+    Get final video path within task directory
+    
+    Args:
+        task_id: Task ID
+    
+    Returns:
+        Absolute path to final video
+        
+    Example:
+        >>> get_task_final_video_path("20251028_143052_ab3d")
+        >>> # Returns: ".../output/20251028_143052_ab3d/final.mp4"
+    """
+    return get_task_path(task_id, "final.mp4")
+
--- a/pixelle_video/utils/prompt_helper.py
+++ b/pixelle_video/utils/prompt_helper.py
@@ -0,0 +1,38 @@
+"""
+Prompt helper utilities
+
+Simple utilities for building prompts with optional prefixes.
+"""
+
+
+def build_image_prompt(prompt: str, prefix: str = "") -> str:
+    """
+    Build final image prompt with optional prefix
+    
+    Args:
+        prompt: User's raw prompt
+        prefix: Optional prefix to add before the prompt
+    
+    Returns:
+        Final prompt with prefix applied (if provided)
+    
+    Examples:
+        >>> build_image_prompt("a cat", "")
+        'a cat'
+        
+        >>> build_image_prompt("a cat", "anime style")
+        'anime style, a cat'
+        
+        >>> build_image_prompt("a cat", "  anime style  ")
+        'anime style, a cat'
+    """
+    prefix = prefix.strip() if prefix else ""
+    prompt = prompt.strip() if prompt else ""
+    
+    if prefix and prompt:
+        return f"{prefix}, {prompt}"
+    elif prefix:
+        return prefix
+    else:
+        return prompt
+
--- a/pixelle_video/utils/tts_util.py
+++ b/pixelle_video/utils/tts_util.py
@@ -0,0 +1,330 @@
+"""
+Edge TTS Utility - Temporarily not used
+
+This is the original edge-tts implementation, kept here for potential future use.
+Currently, TTS service uses ComfyUI workflows only.
+"""
+
+import asyncio
+import ssl
+import random
+import edge_tts as edge_tts_sdk
+from loguru import logger
+from aiohttp import WSServerHandshakeError, ClientResponseError
+
+
+# Global flag for SSL verification (set to False for development only)
+_SSL_VERIFY_ENABLED = False
+
+# Retry configuration for Edge TTS (to handle 401 errors)
+_RETRY_COUNT = 5       # Default retry count (increased from 3 to 5)
+_RETRY_BASE_DELAY = 1.0     # Base retry delay in seconds (for exponential backoff)
+_MAX_RETRY_DELAY = 10.0     # Maximum retry delay in seconds
+
+# Rate limiting configuration
+_REQUEST_DELAY = 0.5        # Minimum delay before each request (seconds)
+_MAX_CONCURRENT_REQUESTS = 3  # Maximum concurrent requests
+
+# Global semaphore for rate limiting
+_request_semaphore = asyncio.Semaphore(_MAX_CONCURRENT_REQUESTS)
+
+
+async def edge_tts(
+    text: str,
+    voice: str = "[Chinese] zh-CN Yunjian",
+    rate: str = "+0%",
+    volume: str = "+0%",
+    pitch: str = "+0Hz",
+    output_path: str = None,
+    retry_count: int = _RETRY_COUNT,
+    retry_base_delay: float = _RETRY_BASE_DELAY,
+) -> bytes:
+    """
+    Convert text to speech using Microsoft Edge TTS
+    
+    This service is free and requires no API key.
+    Supports 400+ voices across 100+ languages.
+    
+    Returns audio data as bytes (MP3 format).
+    
+    Includes automatic retry mechanism with exponential backoff and jitter
+    to handle 401 authentication errors and temporary network issues.
+    Also includes concurrent request limiting and rate limiting.
+    
+    Args:
+        text: Text to convert to speech
+        voice: Voice ID (e.g., [Chinese] zh-CN Yunjian, [English] en-US Jenny)
+        rate: Speech rate (e.g., +0%, +50%, -20%)
+        volume: Speech volume (e.g., +0%, +50%, -20%)
+        pitch: Speech pitch (e.g., +0Hz, +10Hz, -5Hz)
+        output_path: Optional output file path to save audio
+        retry_count: Number of retries on failure (default: 5)
+        retry_base_delay: Base delay for exponential backoff (default: 1.0s)
+    
+    Returns:
+        Audio data as bytes (MP3 format)
+    
+    Popular Chinese voices:
+    - [Chinese] zh-CN Yunjian (male, default)
+    - [Chinese] zh-CN Xiaoxiao (female)
+    - [Chinese] zh-CN Yunxi (male)
+    - [Chinese] zh-CN Xiaoyi (female)
+    
+    Popular English voices:
+    - [English] en-US Jenny (female)
+    - [English] en-US Guy (male)
+    - [English] en-GB Sonia (female, British)
+    
+    Example:
+        audio_bytes = await edge_tts(
+            text="你好，世界！",
+            voice="[Chinese] zh-CN Yunjian",
+            rate="+20%"
+        )
+    """
+    logger.debug(f"Calling Edge TTS with voice: {voice}, rate: {rate}, retry_count: {retry_count}")
+    
+    # Use semaphore to limit concurrent requests
+    async with _request_semaphore:
+        # Add a small random delay before each request to avoid rate limiting
+        pre_delay = _REQUEST_DELAY + random.uniform(0, 0.3)
+        logger.debug(f"Waiting {pre_delay:.2f}s before request (rate limiting)")
+        await asyncio.sleep(pre_delay)
+        
+        last_error = None
+        
+        # Retry loop
+        for attempt in range(retry_count + 1):  # +1 because first attempt is not a retry
+            if attempt > 0:
+                # Exponential backoff with jitter
+                # delay = base * (2 ^ attempt) + random jitter
+                exponential_delay = retry_base_delay * (2 ** (attempt - 1))
+                jitter = random.uniform(0, retry_base_delay)
+                retry_delay = min(exponential_delay + jitter, _MAX_RETRY_DELAY)
+                
+                logger.info(f"🔄 Retrying Edge TTS (attempt {attempt + 1}/{retry_count + 1}) after {retry_delay:.2f}s delay...")
+                await asyncio.sleep(retry_delay)
+            
+            # Monkey patch ssl.create_default_context if SSL verification is disabled
+            if not _SSL_VERIFY_ENABLED:
+                if attempt == 0:  # Only log warning once
+                    logger.warning("SSL verification is disabled for development. This is NOT recommended for production!")
+                original_create_default_context = ssl.create_default_context
+                
+                def create_unverified_context(*args, **kwargs):
+                    ctx = original_create_default_context(*args, **kwargs)
+                    ctx.check_hostname = False
+                    ctx.verify_mode = ssl.CERT_NONE
+                    return ctx
+                
+                # Temporarily replace the function
+                ssl.create_default_context = create_unverified_context
+            
+            try:
+                # Create communicate instance
+                communicate = edge_tts_sdk.Communicate(
+                    text=text,
+                    voice=voice,
+                    rate=rate,
+                    volume=volume,
+                    pitch=pitch,
+                )
+                
+                # Collect audio chunks
+                audio_chunks = []
+                async for chunk in communicate.stream():
+                    if chunk["type"] == "audio":
+                        audio_chunks.append(chunk["data"])
+                
+                audio_data = b"".join(audio_chunks)
+                
+                if attempt > 0:
+                    logger.success(f"✅ Retry succeeded on attempt {attempt + 1}")
+                
+                logger.info(f"Generated {len(audio_data)} bytes of audio data")
+                
+                # Save to file if output_path is provided
+                if output_path:
+                    with open(output_path, "wb") as f:
+                        f.write(audio_data)
+                    logger.info(f"Audio saved to: {output_path}")
+                
+                return audio_data
+            
+            except (WSServerHandshakeError, ClientResponseError) as e:
+                # Network/authentication errors - retry
+                last_error = e
+                error_code = getattr(e, 'status', 'unknown')
+                error_msg = str(e)
+                
+                # Log more detailed information for 401 errors
+                if error_code == 401 or '401' in error_msg:
+                    logger.warning(f"⚠️  Edge TTS 401 Authentication Error (attempt {attempt + 1}/{retry_count + 1})")
+                    logger.debug(f"Error details: {error_msg}")
+                    logger.debug(f"This is usually caused by rate limiting. Will retry with exponential backoff...")
+                else:
+                    logger.warning(f"⚠️  Edge TTS error (attempt {attempt + 1}/{retry_count + 1}): {error_code} - {e}")
+                
+                if attempt >= retry_count:
+                    # Last attempt failed
+                    logger.error(f"❌ All {retry_count + 1} attempts failed. Last error: {error_code}")
+                    raise
+                # Otherwise, continue to next retry
+            
+            except Exception as e:
+                # Other errors - don't retry, raise immediately
+                logger.error(f"Edge TTS error (non-retryable): {type(e).__name__} - {e}")
+                raise
+            
+            finally:
+                # Restore original function if we patched it
+                if not _SSL_VERIFY_ENABLED:
+                    ssl.create_default_context = original_create_default_context
+        
+        # Should not reach here, but just in case
+        if last_error:
+            raise last_error
+        else:
+            raise RuntimeError("Edge TTS failed without error (unexpected)")
+
+
+def get_audio_duration(audio_path: str) -> float:
+    """
+    Get audio file duration in seconds
+    
+    Args:
+        audio_path: Path to audio file
+    
+    Returns:
+        Duration in seconds
+    """
+    try:
+        # Try using ffmpeg-python
+        import ffmpeg
+        probe = ffmpeg.probe(audio_path)
+        duration = float(probe['format']['duration'])
+        return duration
+    except Exception as e:
+        logger.warning(f"Failed to get audio duration: {e}, using estimate")
+        # Fallback: estimate based on file size (very rough)
+        import os
+        file_size = os.path.getsize(audio_path)
+        # Assume ~16kbps for MP3, so 2KB per second
+        estimated_duration = file_size / 2000
+        return max(1.0, estimated_duration)  # At least 1 second
+
+
+async def list_voices(locale: str = None, retry_count: int = _RETRY_COUNT, retry_base_delay: float = _RETRY_BASE_DELAY) -> list[str]:
+    """
+    List all available voices for Edge TTS
+    
+    Returns a list of voice IDs (ShortName).
+    Optionally filter by locale.
+    
+    Includes automatic retry mechanism with exponential backoff and jitter
+    to handle network errors and rate limiting.
+    
+    Args:
+        locale: Filter by locale (e.g., zh-CN, en-US, ja-JP)
+        retry_count: Number of retries on failure (default: 5)
+        retry_base_delay: Base delay for exponential backoff (default: 1.0s)
+    
+    Returns:
+        List of voice IDs
+    
+    Example:
+        # List all voices
+        voices = await list_voices()
+        # Returns: ['[Chinese] zh-CN Yunjian', '[Chinese] zh-CN Xiaoxiao', ...]
+        
+        # List Chinese voices only
+        voices = await list_voices(locale="zh-CN")
+        # Returns: ['[Chinese] zh-CN Yunjian', '[Chinese] zh-CN Xiaoxiao', ...]
+    """
+    logger.debug(f"Fetching Edge TTS voices, locale filter: {locale}, retry_count: {retry_count}")
+    
+    # Use semaphore to limit concurrent requests
+    async with _request_semaphore:
+        # Add a small random delay before each request to avoid rate limiting
+        pre_delay = _REQUEST_DELAY + random.uniform(0, 0.3)
+        logger.debug(f"Waiting {pre_delay:.2f}s before request (rate limiting)")
+        await asyncio.sleep(pre_delay)
+        
+        last_error = None
+        
+        # Retry loop
+        for attempt in range(retry_count + 1):
+            if attempt > 0:
+                # Exponential backoff with jitter
+                exponential_delay = retry_base_delay * (2 ** (attempt - 1))
+                jitter = random.uniform(0, retry_base_delay)
+                retry_delay = min(exponential_delay + jitter, _MAX_RETRY_DELAY)
+                
+                logger.info(f"🔄 Retrying list voices (attempt {attempt + 1}/{retry_count + 1}) after {retry_delay:.2f}s delay...")
+                await asyncio.sleep(retry_delay)
+            
+            # Monkey patch SSL if verification is disabled
+            if not _SSL_VERIFY_ENABLED:
+                if attempt == 0:  # Only log warning once
+                    logger.warning("SSL verification is disabled for development. This is NOT recommended for production!")
+                original_create_default_context = ssl.create_default_context
+                
+                def create_unverified_context(*args, **kwargs):
+                    ctx = original_create_default_context(*args, **kwargs)
+                    ctx.check_hostname = False
+                    ctx.verify_mode = ssl.CERT_NONE
+                    return ctx
+                
+                ssl.create_default_context = create_unverified_context
+            
+            try:
+                # Get all voices
+                voices = await edge_tts_sdk.list_voices()
+                
+                # Filter by locale if specified
+                if locale:
+                    voices = [v for v in voices if v["Locale"].startswith(locale)]
+                
+                # Extract voice IDs (ShortName)
+                voice_ids = [voice["ShortName"] for voice in voices]
+                
+                if attempt > 0:
+                    logger.success(f"✅ Retry succeeded on attempt {attempt + 1}")
+                
+                logger.info(f"Found {len(voice_ids)} voices" + (f" for locale '{locale}'" if locale else ""))
+                return voice_ids
+            
+            except (WSServerHandshakeError, ClientResponseError) as e:
+                # Network/authentication errors - retry
+                last_error = e
+                error_code = getattr(e, 'status', 'unknown')
+                error_msg = str(e)
+                
+                # Log more detailed information for 401 errors
+                if error_code == 401 or '401' in error_msg:
+                    logger.warning(f"⚠️  Edge TTS 401 Authentication Error (list_voices attempt {attempt + 1}/{retry_count + 1})")
+                    logger.debug(f"Error details: {error_msg}")
+                    logger.debug(f"This is usually caused by rate limiting. Will retry with exponential backoff...")
+                else:
+                    logger.warning(f"⚠️  List voices error (attempt {attempt + 1}/{retry_count + 1}): {error_code} - {e}")
+                
+                if attempt >= retry_count:
+                    logger.error(f"❌ All {retry_count + 1} attempts failed. Last error: {error_code}")
+                    raise
+            
+            except Exception as e:
+                # Other errors - don't retry, raise immediately
+                logger.error(f"List voices error (non-retryable): {type(e).__name__} - {e}")
+                raise
+            
+            finally:
+                # Restore original function if we patched it
+                if not _SSL_VERIFY_ENABLED:
+                    ssl.create_default_context = original_create_default_context
+        
+        # Should not reach here, but just in case
+        if last_error:
+            raise last_error
+        else:
+            raise RuntimeError("List voices failed without error (unexpected)")
+