tts支持本地合成

This commit is contained in:
puke
2025-11-06 21:06:14 +08:00
parent 56b6b74af7
commit 393cdb8f0a
9 changed files with 531 additions and 112 deletions

View File

@@ -14,9 +14,28 @@ class LLMConfig(BaseModel):
model: str = Field(default="", description="LLM Model Name")
class TTSLocalConfig(BaseModel):
    """Local TTS configuration (Edge TTS).

    Defaults used when TTS inference runs locally through Edge TTS
    instead of a ComfyUI workflow.
    """

    # Edge TTS voice identifier (e.g. "zh-CN-YunjianNeural")
    voice: str = Field(default="zh-CN-YunjianNeural", description="Edge TTS voice ID")
    # Speed multiplier; validators bound it to the 0.5x-2.0x range
    speed: float = Field(default=1.2, ge=0.5, le=2.0, description="Speech speed multiplier (0.5-2.0)")
class TTSComfyUIConfig(BaseModel):
    """ComfyUI TTS configuration.

    Settings used when TTS inference is delegated to a ComfyUI workflow.
    """

    # Workflow filename used when the caller does not pass one explicitly
    default_workflow: Optional[str] = Field(default=None, description="Default TTS workflow (optional)")
class TTSSubConfig(BaseModel):
    """TTS-specific configuration (under comfyui.tts).

    Selects the TTS inference backend ("local" Edge TTS or "comfyui"
    workflow) and carries the per-backend sub-configurations.
    """

    inference_mode: str = Field(default="local", description="TTS inference mode: 'local' or 'comfyui'")
    local: TTSLocalConfig = Field(default_factory=TTSLocalConfig, description="Local TTS (Edge TTS) configuration")
    comfyui: TTSComfyUIConfig = Field(default_factory=TTSComfyUIConfig, description="ComfyUI TTS configuration")

    # Backward compatibility: older code read `tts.default_workflow` at the top
    # level, so expose it as a read-only property delegating to the nested
    # ComfyUI config. NOTE(fix): the duplicate annotated *field* of the same
    # name was removed — declaring both a pydantic field and a @property with
    # one name makes the property shadow the field in the class namespace and
    # conflicts with pydantic's field collection.
    @property
    def default_workflow(self) -> Optional[str]:
        """Get default workflow (for backward compatibility)."""
        return self.comfyui.default_workflow
class ImageSubConfig(BaseModel):

View File

@@ -24,10 +24,11 @@ class StoryboardConfig:
video_fps: int = 30 # Frame rate
# Audio parameters
voice_id: str = "[Chinese] zh-CN Yunjian" # Default voice
tts_workflow: Optional[str] = None # TTS workflow filename (None = use default)
tts_speed: float = 1.2 # TTS speed multiplier (1.0 = normal, >1.0 = faster)
ref_audio: Optional[str] = None # Reference audio for voice cloning (only some workflows support this)
tts_inference_mode: str = "local" # TTS inference mode: "local" or "comfyui"
voice_id: Optional[str] = None # Voice ID (for local: Edge TTS voice ID; for comfyui: workflow-specific)
tts_workflow: Optional[str] = None # TTS workflow filename (for ComfyUI mode, None = use default)
tts_speed: Optional[float] = None # TTS speed multiplier (0.5-2.0, 1.0 = normal)
ref_audio: Optional[str] = None # Reference audio for voice cloning (ComfyUI mode only)
# Image parameters
image_width: int = 1024

View File

@@ -62,10 +62,17 @@ class StandardPipeline(BasePipeline):
# === Basic Config ===
n_scenes: int = 5, # Only used in generate mode; ignored in fixed mode
voice_id: str = "[Chinese] zh-CN Yunjian",
tts_workflow: Optional[str] = None,
tts_speed: float = 1.2,
ref_audio: Optional[str] = None, # Reference audio for voice cloning
# === TTS Parameters ===
tts_inference_mode: Optional[str] = None, # "local" or "comfyui"
tts_voice: Optional[str] = None, # For local mode: Edge TTS voice ID
tts_speed: Optional[float] = None, # Speed multiplier (0.5-2.0)
tts_workflow: Optional[str] = None, # For ComfyUI mode: workflow path
ref_audio: Optional[str] = None, # For ComfyUI mode: reference audio
# Deprecated (kept for backward compatibility)
voice_id: Optional[str] = None,
output_path: Optional[str] = None,
# === LLM Parameters ===
@@ -191,6 +198,29 @@ class StandardPipeline(BasePipeline):
output_path = get_task_final_video_path(task_id)
logger.info(f" Will copy final video to: {user_specified_output}")
# Determine TTS inference mode and parameters
# Priority: explicit params > backward compatibility > config defaults
if tts_inference_mode is None:
# Check if user provided ComfyUI-specific params
if tts_workflow is not None or ref_audio is not None:
tts_inference_mode = "comfyui"
# Check if user provided old voice_id param (backward compatibility)
elif voice_id is not None:
tts_inference_mode = "comfyui"
if tts_voice is None:
tts_voice = voice_id
else:
# Use config default
tts_config = self.core.config.get("comfyui", {}).get("tts", {})
tts_inference_mode = tts_config.get("inference_mode", "local")
# Set voice_id based on mode for StoryboardConfig
final_voice_id = None
if tts_inference_mode == "local":
final_voice_id = tts_voice or voice_id
else: # comfyui
final_voice_id = voice_id # For ComfyUI, might be None
# Create storyboard config
config = StoryboardConfig(
task_id=task_id,
@@ -200,7 +230,8 @@ class StandardPipeline(BasePipeline):
min_image_prompt_words=min_image_prompt_words,
max_image_prompt_words=max_image_prompt_words,
video_fps=video_fps,
voice_id=voice_id,
tts_inference_mode=tts_inference_mode,
voice_id=final_voice_id,
tts_workflow=tts_workflow,
tts_speed=tts_speed,
ref_audio=ref_audio,

View File

@@ -124,18 +124,29 @@ class FrameProcessor:
from pixelle_video.utils.os_util import get_task_frame_path
output_path = get_task_frame_path(config.task_id, frame.index, "audio")
# Call TTS with specific output path and workflow
# Build TTS params based on inference mode
tts_params = {
"text": frame.narration,
"workflow": config.tts_workflow,
"voice": config.voice_id,
"speed": config.tts_speed,
"inference_mode": config.tts_inference_mode,
"output_path": output_path,
}
# Add ref_audio if provided
if config.ref_audio:
tts_params["ref_audio"] = config.ref_audio
if config.tts_inference_mode == "local":
# Local mode: pass voice and speed
if config.voice_id:
tts_params["voice"] = config.voice_id
if config.tts_speed is not None:
tts_params["speed"] = config.tts_speed
else: # comfyui
# ComfyUI mode: pass workflow, voice, speed, and ref_audio
if config.tts_workflow:
tts_params["workflow"] = config.tts_workflow
if config.voice_id:
tts_params["voice"] = config.voice_id
if config.tts_speed is not None:
tts_params["speed"] = config.tts_speed
if config.ref_audio:
tts_params["ref_audio"] = config.ref_audio
audio_path = await self.core.tts(**tts_params)

View File

@@ -1,13 +1,18 @@
"""
TTS (Text-to-Speech) Service - ComfyUI Workflow-based implementation
TTS (Text-to-Speech) Service - Supports both local and ComfyUI inference
"""
import os
import uuid
from pathlib import Path
from typing import Optional
from comfykit import ComfyKit
from loguru import logger
from pixelle_video.services.comfy_base_service import ComfyBaseService
from pixelle_video.utils.tts_util import edge_tts
from pixelle_video.tts_voices import speed_to_rate
class TTSService(ComfyBaseService):
@@ -52,22 +57,25 @@ class TTSService(ComfyBaseService):
comfyui_url: Optional[str] = None,
runninghub_api_key: Optional[str] = None,
# TTS parameters
voice: str = "[Chinese] zh-CN Yunjian",
speed: float = 1.2,
voice: Optional[str] = None,
speed: Optional[float] = None,
# Inference mode override
inference_mode: Optional[str] = None,
# Output path
output_path: Optional[str] = None,
**params
) -> str:
"""
Generate speech using ComfyUI workflow
Generate speech using local Edge TTS or ComfyUI workflow
Args:
text: Text to convert to speech
workflow: Workflow filename (default: from config)
workflow: Workflow filename (for ComfyUI mode, default: from config)
comfyui_url: ComfyUI URL (optional, overrides config)
runninghub_api_key: RunningHub API key (optional, overrides config)
voice: Voice ID (workflow-specific)
voice: Voice ID (for local mode: Edge TTS voice ID; for ComfyUI: workflow-specific)
speed: Speech speed multiplier (1.0 = normal, >1.0 = faster, <1.0 = slower)
inference_mode: Override inference mode ("local" or "comfyui", default: from config)
output_path: Custom output path (auto-generated if None)
**params: Additional workflow parameters
@@ -75,49 +83,103 @@ class TTSService(ComfyBaseService):
Generated audio file path
Examples:
# Simplest: use default workflow
audio_path = await pixelle_video.tts(text="Hello, world!")
# Use specific workflow
# Local inference (Edge TTS)
audio_path = await pixelle_video.tts(
text="你好,世界!",
workflow="tts_edge.json"
)
# With voice and speed
audio_path = await pixelle_video.tts(
text="Hello",
workflow="tts_edge.json",
voice="[Chinese] zh-CN Xiaoxiao",
text="Hello, world!",
inference_mode="local",
voice="zh-CN-YunjianNeural",
speed=1.2
)
# With absolute path
# ComfyUI inference
audio_path = await pixelle_video.tts(
text="Hello",
workflow="/path/to/custom_tts.json"
)
# With custom ComfyUI server
audio_path = await pixelle_video.tts(
text="Hello",
comfyui_url="http://192.168.1.100:8188"
text="你好,世界!",
inference_mode="comfyui",
workflow="runninghub/tts_edge.json"
)
"""
# 1. Resolve workflow (returns structured info)
workflow_info = self._resolve_workflow(workflow=workflow)
# Determine inference mode (param > config)
mode = inference_mode or self.config.get("inference_mode", "local")
# 2. Execute ComfyUI workflow
return await self._call_comfyui_workflow(
workflow_info=workflow_info,
text=text,
comfyui_url=comfyui_url,
runninghub_api_key=runninghub_api_key,
voice=voice,
speed=speed,
output_path=output_path,
**params
)
# Route to appropriate implementation
if mode == "local":
return await self._call_local_tts(
text=text,
voice=voice,
speed=speed,
output_path=output_path
)
else: # comfyui
# 1. Resolve workflow (returns structured info)
workflow_info = self._resolve_workflow(workflow=workflow)
# 2. Execute ComfyUI workflow
return await self._call_comfyui_workflow(
workflow_info=workflow_info,
text=text,
comfyui_url=comfyui_url,
runninghub_api_key=runninghub_api_key,
voice=voice,
speed=speed,
output_path=output_path,
**params
)
async def _call_local_tts(
    self,
    text: str,
    voice: Optional[str] = None,
    speed: Optional[float] = None,
    output_path: Optional[str] = None,
) -> str:
    """
    Generate speech using local Edge TTS.

    Args:
        text: Text to convert to speech
        voice: Edge TTS voice ID (default: from config)
        speed: Speech speed multiplier (default: from config)
        output_path: Custom output path (auto-generated if None)

    Returns:
        Generated audio file path

    Raises:
        Exception: re-raised from the underlying Edge TTS call after logging.
    """
    # Defaults live under the "local" sub-section of the TTS config.
    local_config = self.config.get("local", {})

    # Explicit parameters win over config. `speed` is compared against None
    # (not truthiness) so a sub-1.0 multiplier is still honored.
    final_voice = voice or local_config.get("voice", "zh-CN-YunjianNeural")
    final_speed = speed if speed is not None else local_config.get("speed", 1.2)

    # Edge TTS expects a percentage rate string ("+20%"), not a multiplier.
    rate = speed_to_rate(final_speed)

    logger.info(f"🎙️ Using local Edge TTS: voice={final_voice}, speed={final_speed}x (rate={rate})")

    # Generate a unique output path if the caller did not supply one.
    if not output_path:
        unique_id = uuid.uuid4().hex
        output_path = f"output/{unique_id}.mp3"
        # Ensure the default output directory exists for the generated path.
        Path("output").mkdir(parents=True, exist_ok=True)

    try:
        # edge_tts receives output_path and is expected to write the file
        # there; its return value was previously captured into an unused
        # `audio_bytes` local, which has been dropped.
        await edge_tts(
            text=text,
            voice=final_voice,
            rate=rate,
            output_path=output_path
        )
        logger.info(f"✅ Generated audio (local Edge TTS): {output_path}")
        return output_path
    except Exception as e:
        logger.error(f"Local TTS generation error: {e}")
        raise
async def _call_comfyui_workflow(
self,

147
pixelle_video/tts_voices.py Normal file
View File

@@ -0,0 +1,147 @@
"""
TTS Voice Configuration
Defines available voices for local Edge TTS inference.
"""
from typing import List, Dict, Any
def _edge_voice(voice_id: str, locale: str, gender: str) -> Dict[str, Any]:
    """Build one Edge TTS voice entry; the i18n label key is derived from the ID."""
    return {
        "id": voice_id,
        "label_key": "tts.voice." + voice_id.replace("-", "_"),
        "locale": locale,
        "gender": gender,
    }


# Edge TTS voice presets for local inference
EDGE_TTS_VOICES: List[Dict[str, Any]] = [
    # Chinese voices
    _edge_voice("zh-CN-XiaoxiaoNeural", "zh-CN", "female"),
    _edge_voice("zh-CN-XiaoyiNeural", "zh-CN", "female"),
    _edge_voice("zh-CN-YunjianNeural", "zh-CN", "male"),
    _edge_voice("zh-CN-YunxiNeural", "zh-CN", "male"),
    _edge_voice("zh-CN-YunyangNeural", "zh-CN", "male"),
    _edge_voice("zh-CN-YunyeNeural", "zh-CN", "male"),
    _edge_voice("zh-CN-YunfengNeural", "zh-CN", "male"),
    _edge_voice("zh-CN-liaoning-XiaobeiNeural", "zh-CN", "female"),
    # English voices
    _edge_voice("en-US-AriaNeural", "en-US", "female"),
    _edge_voice("en-US-JennyNeural", "en-US", "female"),
    _edge_voice("en-US-GuyNeural", "en-US", "male"),
    _edge_voice("en-US-DavisNeural", "en-US", "male"),
    _edge_voice("en-GB-SoniaNeural", "en-GB", "female"),
    _edge_voice("en-GB-RyanNeural", "en-GB", "male"),
]


def get_voice_display_name(voice_id: str, tr_func=None, locale: str = "zh_CN") -> str:
    """
    Get display name for voice.

    Args:
        voice_id: Voice ID (e.g., "zh-CN-YunjianNeural")
        tr_func: Translation function (optional)
        locale: Current locale (default: "zh_CN")

    Returns:
        Translated label when the voice is known, the locale is "zh_CN",
        and a translation function is supplied; the raw voice ID otherwise.
    """
    for entry in EDGE_TTS_VOICES:
        if entry["id"] == voice_id:
            # Only the Chinese UI carries translated labels; every other
            # locale falls back to the raw voice ID.
            if locale == "zh_CN" and tr_func:
                return tr_func(entry["label_key"])
            return voice_id
    # Unknown voice: nothing to translate, return the ID as-is.
    return voice_id
def speed_to_rate(speed: float) -> str:
    """
    Convert speed multiplier to Edge TTS rate parameter.

    Args:
        speed: Speed multiplier (1.0 = normal, 1.2 = 120%)

    Returns:
        Rate string (e.g., "+20%", "-10%")

    Examples:
        1.0 → "+0%"
        1.2 → "+20%"
        0.8 → "-20%"
    """
    # round(), not int(): truncation made 1.2 yield "+19%" because
    # (1.2 - 1.0) * 100 evaluates to 19.999... in binary floating point.
    percentage = round((speed - 1.0) * 100)
    sign = "+" if percentage >= 0 else ""
    return f"{sign}{percentage}%"