diff --git a/api/routers/tts.py b/api/routers/tts.py index 9660468..4ad747e 100644 --- a/api/routers/tts.py +++ b/api/routers/tts.py @@ -23,7 +23,7 @@ async def tts_synthesize( Convert text to speech audio. - **text**: Text to synthesize - - **voice_id**: Voice ID (e.g., 'zh-CN-YunjianNeural', 'en-US-AriaNeural') + - **voice_id**: Voice ID (e.g., '[Chinese] zh-CN Yunjian', '[English] en-US Aria') Returns path to generated audio file and duration. """ diff --git a/api/schemas/tts.py b/api/schemas/tts.py index 92bf98d..de41df8 100644 --- a/api/schemas/tts.py +++ b/api/schemas/tts.py @@ -8,13 +8,13 @@ from pydantic import BaseModel, Field class TTSSynthesizeRequest(BaseModel): """TTS synthesis request""" text: str = Field(..., description="Text to synthesize") - voice_id: str = Field("zh-CN-YunjianNeural", description="Voice ID") + voice_id: str = Field("[Chinese] zh-CN Yunjian", description="Voice ID") class Config: json_schema_extra = { "example": { "text": "Hello, welcome to ReelForge!", - "voice_id": "zh-CN-YunjianNeural" + "voice_id": "[Chinese] zh-CN Yunjian" } } diff --git a/api/schemas/video.py b/api/schemas/video.py index d98e66c..39ad8e3 100644 --- a/api/schemas/video.py +++ b/api/schemas/video.py @@ -23,7 +23,7 @@ class VideoGenerateRequest(BaseModel): # === Basic Config === n_scenes: int = Field(5, ge=1, le=20, description="Number of scenes (generate mode only)") - voice_id: str = Field("zh-CN-YunjianNeural", description="TTS voice ID") + voice_id: str = Field("[Chinese] zh-CN Yunjian", description="TTS voice ID") # === LLM Parameters === min_narration_words: int = Field(5, ge=1, le=100, description="Min narration words") @@ -57,7 +57,7 @@ class VideoGenerateRequest(BaseModel): "text": "Atomic Habits teaches us that small changes compound over time to produce remarkable results.", "mode": "generate", "n_scenes": 5, - "voice_id": "zh-CN-YunjianNeural", + "voice_id": "[Chinese] zh-CN Yunjian", "title": "The Power of Atomic Habits" } } diff --git a/config.example.yaml b/config.example.yaml index 0826440..b8661b6 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -25,7 +25,7 @@ comfyui: # TTS-specific configuration tts: - default: selfhost/tts_edge.json # TTS workflow to use + default_workflow: selfhost/tts_edge.json # TTS workflow to use # Image-specific configuration image: diff --git a/reelforge/config/schema.py b/reelforge/config/schema.py index 7a61900..c0f6683 100644 --- a/reelforge/config/schema.py +++ b/reelforge/config/schema.py @@ -15,16 +15,12 @@ class LLMConfig(BaseModel): class TTSSubConfig(BaseModel): """TTS-specific configuration (under comfyui.tts)""" - model_config = {"populate_by_name": True} # Allow both field name and alias - - default_workflow: str = Field(default=None, description="Default TTS workflow (required, no fallback)", alias="default") + default_workflow: str = Field(default=None, description="Default TTS workflow (required, no fallback)") class ImageSubConfig(BaseModel): """Image-specific configuration (under comfyui.image)""" - model_config = {"populate_by_name": True} # Allow both field name and alias - - default_workflow: str = Field(default=None, description="Default image workflow (required, no fallback)", alias="default") + default_workflow: str = Field(default=None, description="Default image workflow (required, no fallback)") prompt_prefix: str = Field( default="Pure white background, minimalist illustration, matchstick figure style, black and white line drawing, simple clean lines", description="Prompt prefix for all image generation" diff --git a/reelforge/models/storyboard.py b/reelforge/models/storyboard.py index 0420683..e483769 100644 --- a/reelforge/models/storyboard.py +++ b/reelforge/models/storyboard.py @@ -26,7 +26,9 @@ class StoryboardConfig: video_fps: int = 30 # Frame rate # Audio parameters - voice_id: str = "zh-CN-YunjianNeural" # Default voice + voice_id: str = "[Chinese] zh-CN Yunjian" # Default voice + tts_workflow: Optional[str] = None # TTS workflow filename (None = use default) + tts_speed: float = 1.2 # TTS speed multiplier (1.0 = normal, >1.0 = faster) # Image parameters image_width: int = 1024 diff --git a/reelforge/services/frame_processor.py b/reelforge/services/frame_processor.py index c61786e..ed249f0 100644 --- a/reelforge/services/frame_processor.py +++ b/reelforge/services/frame_processor.py @@ -124,11 +124,12 @@ class FrameProcessor: from reelforge.utils.os_util import get_task_frame_path output_path = get_task_frame_path(config.task_id, frame.index, "audio") - # Call TTS with specific output path + # Call TTS with specific output path and workflow audio_path = await self.core.tts( text=frame.narration, + workflow=config.tts_workflow, # Use workflow from config voice=config.voice_id, - rate="+20%", + speed=config.tts_speed, # Use speed (not rate) from config output_path=output_path, ) diff --git a/reelforge/services/image_prompt_generator.py b/reelforge/services/image_prompt_generator.py index 786d0e3..d8a8b02 100644 --- a/reelforge/services/image_prompt_generator.py +++ b/reelforge/services/image_prompt_generator.py @@ -116,8 +116,8 @@ class ImagePromptGeneratorService: # 5. Apply prompt prefix to each prompt from reelforge.utils.prompt_helper import build_image_prompt - # Get prompt prefix from config - image_config = self.core.config.get("image", {}) + # Get prompt prefix from config (fix: correct path is comfyui.image.prompt_prefix) + image_config = self.core.config.get("comfyui", {}).get("image", {}) prompt_prefix = image_config.get("prompt_prefix", "") # Apply prefix to each base prompt diff --git a/reelforge/services/tts_service.py b/reelforge/services/tts_service.py index 6f4d11f..2e21993 100644 --- a/reelforge/services/tts_service.py +++ b/reelforge/services/tts_service.py @@ -52,8 +52,8 @@ class TTSService(ComfyBaseService): comfyui_url: Optional[str] = None, runninghub_api_key: Optional[str] = None, # TTS parameters - voice: Optional[str] = None, - speed: float = 1.0, + voice: str = "[Chinese] zh-CN Yunjian", + speed: float = 1.2, # Output path output_path: Optional[str] = None, **params @@ -88,7 +88,7 @@ class TTSService(ComfyBaseService): audio_path = await reelforge.tts( text="Hello", workflow="tts_edge.json", - voice="zh-CN-XiaoxiaoNeural", + voice="[Chinese] zh-CN Xiaoxiao", speed=1.2 ) diff --git a/reelforge/services/video_generator.py b/reelforge/services/video_generator.py index f017b30..74e1a0c 100644 --- a/reelforge/services/video_generator.py +++ b/reelforge/services/video_generator.py @@ -54,7 +54,9 @@ class VideoGeneratorService: # === Basic Config === n_scenes: int = 5, # Only used in generate mode; ignored in fixed mode - voice_id: str = "zh-CN-YunjianNeural", + voice_id: str = "[Chinese] zh-CN Yunjian", + tts_workflow: Optional[str] = None, + tts_speed: float = 1.2, output_path: Optional[str] = None, # === LLM Parameters === @@ -111,7 +113,9 @@ class VideoGeneratorService: n_scenes: Number of storyboard scenes (default 5) Only effective in generate mode; ignored in fixed mode - voice_id: TTS voice ID (default "zh-CN-YunjianNeural") + voice_id: TTS voice ID (default "[Chinese] zh-CN Yunjian") + tts_workflow: TTS workflow filename (e.g., "tts_edge.json", None = use default) + tts_speed: TTS speed multiplier (1.0 = normal, 1.2 = 20% faster, default 1.2) output_path: Output video path (auto-generated if None) min_narration_words: Min narration length (generate mode only) @@ -219,6 +223,8 @@ class VideoGeneratorService: video_height=video_height, video_fps=video_fps, voice_id=voice_id, + tts_workflow=tts_workflow, + tts_speed=tts_speed, image_width=image_width, image_height=image_height, image_workflow=image_workflow, @@ -259,7 +265,8 @@ class VideoGeneratorService: # Override prompt_prefix if provided (temporarily modify config) original_prefix = None if prompt_prefix is not None: - image_config = self.core.config.get("image", {}) + # Fix: image config is under comfyui.image, not directly under config + image_config = self.core.config.get("comfyui", {}).get("image", {}) original_prefix = image_config.get("prompt_prefix") image_config["prompt_prefix"] = prompt_prefix logger.info(f"Using custom prompt_prefix: '{prompt_prefix}'") diff --git a/reelforge/utils/tts_util.py b/reelforge/utils/tts_util.py index 8280b57..f69ca71 100644 --- a/reelforge/utils/tts_util.py +++ b/reelforge/utils/tts_util.py @@ -31,7 +31,7 @@ _request_semaphore = asyncio.Semaphore(_MAX_CONCURRENT_REQUESTS) async def edge_tts( text: str, - voice: str = "zh-CN-YunjianNeural", + voice: str = "[Chinese] zh-CN Yunjian", rate: str = "+0%", volume: str = "+0%", pitch: str = "+0Hz", @@ -53,7 +53,7 @@ async def edge_tts( Args: text: Text to convert to speech - voice: Voice ID (e.g., zh-CN-YunjianNeural, en-US-JennyNeural) + voice: Voice ID (e.g., [Chinese] zh-CN Yunjian, [English] en-US Jenny) rate: Speech rate (e.g., +0%, +50%, -20%) volume: Speech volume (e.g., +0%, +50%, -20%) pitch: Speech pitch (e.g., +0Hz, +10Hz, -5Hz) @@ -65,20 +65,20 @@ async def edge_tts( Audio data as bytes (MP3 format) Popular Chinese voices: - - zh-CN-YunjianNeural (male, default) - - zh-CN-XiaoxiaoNeural (female) - - zh-CN-YunxiNeural (male) - - zh-CN-XiaoyiNeural (female) + - [Chinese] zh-CN Yunjian (male, default) + - [Chinese] zh-CN Xiaoxiao (female) + - [Chinese] zh-CN Yunxi (male) + - [Chinese] zh-CN Xiaoyi (female) Popular English voices: - - en-US-JennyNeural (female) - - en-US-GuyNeural (male) - - en-GB-SoniaNeural (female, British) + - [English] en-US Jenny (female) + - [English] en-US Guy (male) + - [English] en-GB Sonia (female, British) Example: audio_bytes = await edge_tts( text="你好,世界!", - voice="zh-CN-YunjianNeural", + voice="[Chinese] zh-CN Yunjian", rate="+20%" ) """ @@ -235,11 +235,11 @@ async def list_voices(locale: str = None, retry_count: int = _RETRY_COUNT, retry Example: # List all voices voices = await list_voices() - # Returns: ['zh-CN-YunjianNeural', 'zh-CN-XiaoxiaoNeural', ...] + # Returns: ['[Chinese] zh-CN Yunjian', '[Chinese] zh-CN Xiaoxiao', ...] # List Chinese voices only voices = await list_voices(locale="zh-CN") - # Returns: ['zh-CN-YunjianNeural', 'zh-CN-XiaoxiaoNeural', ...] + # Returns: ['[Chinese] zh-CN Yunjian', '[Chinese] zh-CN Xiaoxiao', ...] """ logger.debug(f"Fetching Edge TTS voices, locale filter: {locale}, retry_count: {retry_count}") diff --git a/web/app.py b/web/app.py index 23bfc3e..9c1bbb8 100644 --- a/web/app.py +++ b/web/app.py @@ -459,7 +459,7 @@ def main(): else: tts_workflow_key = "selfhost/tts_edge.json" # fallback - # TTS preview expander (similar to image preview) + # TTS preview expander (simplified, uses default voice and speed) with st.expander(tr("tts.preview_title"), expanded=False): # Preview text input preview_text = st.text_input( @@ -473,7 +473,7 @@ def main(): if st.button(tr("tts.preview_button"), key="preview_tts", use_container_width=True): with st.spinner(tr("tts.previewing")): try: - # Generate preview audio using selected workflow + # Generate preview audio using selected workflow (use default voice and speed) audio_path = run_async(reelforge.tts( text=preview_text, workflow=tts_workflow_key