Update TTS-related configuration: adjust the voice ID format, refine workflow parameters, and ensure consistency and readability. (原文: 更新TTS相关配置,调整语音ID格式,优化工作流参数,确保一致性和可读性。)

This commit is contained in:
puke
2025-10-30 00:06:23 +08:00
parent fb18adf318
commit f7ad45354e
12 changed files with 43 additions and 37 deletions

View File

@@ -23,7 +23,7 @@ async def tts_synthesize(
Convert text to speech audio.
- **text**: Text to synthesize
- **voice_id**: Voice ID (e.g., 'zh-CN-YunjianNeural', 'en-US-AriaNeural')
- **voice_id**: Voice ID (e.g., '[Chinese] zh-CN Yunjian', '[English] en-US Aria')
Returns path to generated audio file and duration.
"""

View File

@@ -8,13 +8,13 @@ from pydantic import BaseModel, Field
class TTSSynthesizeRequest(BaseModel):
"""TTS synthesis request"""
text: str = Field(..., description="Text to synthesize")
voice_id: str = Field("zh-CN-YunjianNeural", description="Voice ID")
voice_id: str = Field("[Chinese] zh-CN Yunjian", description="Voice ID")
class Config:
json_schema_extra = {
"example": {
"text": "Hello, welcome to ReelForge!",
"voice_id": "zh-CN-YunjianNeural"
"voice_id": "[Chinese] zh-CN Yunjian"
}
}

View File

@@ -23,7 +23,7 @@ class VideoGenerateRequest(BaseModel):
# === Basic Config ===
n_scenes: int = Field(5, ge=1, le=20, description="Number of scenes (generate mode only)")
voice_id: str = Field("zh-CN-YunjianNeural", description="TTS voice ID")
voice_id: str = Field("[Chinese] zh-CN Yunjian", description="TTS voice ID")
# === LLM Parameters ===
min_narration_words: int = Field(5, ge=1, le=100, description="Min narration words")
@@ -57,7 +57,7 @@ class VideoGenerateRequest(BaseModel):
"text": "Atomic Habits teaches us that small changes compound over time to produce remarkable results.",
"mode": "generate",
"n_scenes": 5,
"voice_id": "zh-CN-YunjianNeural",
"voice_id": "[Chinese] zh-CN Yunjian",
"title": "The Power of Atomic Habits"
}
}

View File

@@ -25,7 +25,7 @@ comfyui:
# TTS-specific configuration
tts:
default: selfhost/tts_edge.json # TTS workflow to use
default_workflow: selfhost/tts_edge.json # TTS workflow to use
# Image-specific configuration
image:

View File

@@ -15,16 +15,12 @@ class LLMConfig(BaseModel):
class TTSSubConfig(BaseModel):
"""TTS-specific configuration (under comfyui.tts)"""
model_config = {"populate_by_name": True} # Allow both field name and alias
default_workflow: str = Field(default=None, description="Default TTS workflow (required, no fallback)", alias="default")
default_workflow: str = Field(default=None, description="Default TTS workflow (required, no fallback)")
class ImageSubConfig(BaseModel):
"""Image-specific configuration (under comfyui.image)"""
model_config = {"populate_by_name": True} # Allow both field name and alias
default_workflow: str = Field(default=None, description="Default image workflow (required, no fallback)", alias="default")
default_workflow: str = Field(default=None, description="Default image workflow (required, no fallback)")
prompt_prefix: str = Field(
default="Pure white background, minimalist illustration, matchstick figure style, black and white line drawing, simple clean lines",
description="Prompt prefix for all image generation"

View File

@@ -26,7 +26,9 @@ class StoryboardConfig:
video_fps: int = 30 # Frame rate
# Audio parameters
voice_id: str = "zh-CN-YunjianNeural" # Default voice
voice_id: str = "[Chinese] zh-CN Yunjian" # Default voice
tts_workflow: Optional[str] = None # TTS workflow filename (None = use default)
tts_speed: float = 1.2 # TTS speed multiplier (1.0 = normal, >1.0 = faster)
# Image parameters
image_width: int = 1024

View File

@@ -124,11 +124,12 @@ class FrameProcessor:
from reelforge.utils.os_util import get_task_frame_path
output_path = get_task_frame_path(config.task_id, frame.index, "audio")
# Call TTS with specific output path
# Call TTS with specific output path and workflow
audio_path = await self.core.tts(
text=frame.narration,
workflow=config.tts_workflow, # Use workflow from config
voice=config.voice_id,
rate="+20%",
speed=config.tts_speed, # Use speed (not rate) from config
output_path=output_path,
)

View File

@@ -116,8 +116,8 @@ class ImagePromptGeneratorService:
# 5. Apply prompt prefix to each prompt
from reelforge.utils.prompt_helper import build_image_prompt
# Get prompt prefix from config
image_config = self.core.config.get("image", {})
# Get prompt prefix from config (fix: correct path is comfyui.image.prompt_prefix)
image_config = self.core.config.get("comfyui", {}).get("image", {})
prompt_prefix = image_config.get("prompt_prefix", "")
# Apply prefix to each base prompt

View File

@@ -52,8 +52,8 @@ class TTSService(ComfyBaseService):
comfyui_url: Optional[str] = None,
runninghub_api_key: Optional[str] = None,
# TTS parameters
voice: Optional[str] = None,
speed: float = 1.0,
voice: str = "[Chinese] zh-CN Yunjian",
speed: float = 1.2,
# Output path
output_path: Optional[str] = None,
**params
@@ -88,7 +88,7 @@ class TTSService(ComfyBaseService):
audio_path = await reelforge.tts(
text="Hello",
workflow="tts_edge.json",
voice="zh-CN-XiaoxiaoNeural",
voice="[Chinese] zh-CN Xiaoxiao",
speed=1.2
)

View File

@@ -54,7 +54,9 @@ class VideoGeneratorService:
# === Basic Config ===
n_scenes: int = 5, # Only used in generate mode; ignored in fixed mode
voice_id: str = "zh-CN-YunjianNeural",
voice_id: str = "[Chinese] zh-CN Yunjian",
tts_workflow: Optional[str] = None,
tts_speed: float = 1.2,
output_path: Optional[str] = None,
# === LLM Parameters ===
@@ -111,7 +113,9 @@ class VideoGeneratorService:
n_scenes: Number of storyboard scenes (default 5)
Only effective in generate mode; ignored in fixed mode
voice_id: TTS voice ID (default "zh-CN-YunjianNeural")
voice_id: TTS voice ID (default "[Chinese] zh-CN Yunjian")
tts_workflow: TTS workflow filename (e.g., "tts_edge.json", None = use default)
tts_speed: TTS speed multiplier (1.0 = normal, 1.2 = 20% faster, default 1.2)
output_path: Output video path (auto-generated if None)
min_narration_words: Min narration length (generate mode only)
@@ -219,6 +223,8 @@ class VideoGeneratorService:
video_height=video_height,
video_fps=video_fps,
voice_id=voice_id,
tts_workflow=tts_workflow,
tts_speed=tts_speed,
image_width=image_width,
image_height=image_height,
image_workflow=image_workflow,
@@ -259,7 +265,8 @@ class VideoGeneratorService:
# Override prompt_prefix if provided (temporarily modify config)
original_prefix = None
if prompt_prefix is not None:
image_config = self.core.config.get("image", {})
# Fix: image config is under comfyui.image, not directly under config
image_config = self.core.config.get("comfyui", {}).get("image", {})
original_prefix = image_config.get("prompt_prefix")
image_config["prompt_prefix"] = prompt_prefix
logger.info(f"Using custom prompt_prefix: '{prompt_prefix}'")

View File

@@ -31,7 +31,7 @@ _request_semaphore = asyncio.Semaphore(_MAX_CONCURRENT_REQUESTS)
async def edge_tts(
text: str,
voice: str = "zh-CN-YunjianNeural",
voice: str = "[Chinese] zh-CN Yunjian",
rate: str = "+0%",
volume: str = "+0%",
pitch: str = "+0Hz",
@@ -53,7 +53,7 @@ async def edge_tts(
Args:
text: Text to convert to speech
voice: Voice ID (e.g., zh-CN-YunjianNeural, en-US-JennyNeural)
voice: Voice ID (e.g., [Chinese] zh-CN Yunjian, [English] en-US Jenny)
rate: Speech rate (e.g., +0%, +50%, -20%)
volume: Speech volume (e.g., +0%, +50%, -20%)
pitch: Speech pitch (e.g., +0Hz, +10Hz, -5Hz)
@@ -65,20 +65,20 @@ async def edge_tts(
Audio data as bytes (MP3 format)
Popular Chinese voices:
- zh-CN-YunjianNeural (male, default)
- zh-CN-XiaoxiaoNeural (female)
- zh-CN-YunxiNeural (male)
- zh-CN-XiaoyiNeural (female)
- [Chinese] zh-CN Yunjian (male, default)
- [Chinese] zh-CN Xiaoxiao (female)
- [Chinese] zh-CN Yunxi (male)
- [Chinese] zh-CN Xiaoyi (female)
Popular English voices:
- en-US-JennyNeural (female)
- en-US-GuyNeural (male)
- en-GB-SoniaNeural (female, British)
- [English] en-US Jenny (female)
- [English] en-US Guy (male)
- [English] en-GB Sonia (female, British)
Example:
audio_bytes = await edge_tts(
text="你好,世界!",
voice="zh-CN-YunjianNeural",
voice="[Chinese] zh-CN Yunjian",
rate="+20%"
)
"""
@@ -235,11 +235,11 @@ async def list_voices(locale: str = None, retry_count: int = _RETRY_COUNT, retry
Example:
# List all voices
voices = await list_voices()
# Returns: ['zh-CN-YunjianNeural', 'zh-CN-XiaoxiaoNeural', ...]
# Returns: ['[Chinese] zh-CN Yunjian', '[Chinese] zh-CN Xiaoxiao', ...]
# List Chinese voices only
voices = await list_voices(locale="zh-CN")
# Returns: ['zh-CN-YunjianNeural', 'zh-CN-XiaoxiaoNeural', ...]
# Returns: ['[Chinese] zh-CN Yunjian', '[Chinese] zh-CN Xiaoxiao', ...]
"""
logger.debug(f"Fetching Edge TTS voices, locale filter: {locale}, retry_count: {retry_count}")

View File

@@ -459,7 +459,7 @@ def main():
else:
tts_workflow_key = "selfhost/tts_edge.json" # fallback
# TTS preview expander (similar to image preview)
# TTS preview expander (simplified, uses default voice and speed)
with st.expander(tr("tts.preview_title"), expanded=False):
# Preview text input
preview_text = st.text_input(
@@ -473,7 +473,7 @@ def main():
if st.button(tr("tts.preview_button"), key="preview_tts", use_container_width=True):
with st.spinner(tr("tts.previewing")):
try:
# Generate preview audio using selected workflow
# Generate preview audio using selected workflow (use default voice and speed)
audio_path = run_async(reelforge.tts(
text=preview_text,
workflow=tts_workflow_key