更新TTS相关配置,调整语音ID格式,优化工作流参数,确保一致性和可读性。

This commit is contained in:
puke
2025-10-30 00:06:23 +08:00
parent fb18adf318
commit f7ad45354e
12 changed files with 43 additions and 37 deletions

View File

@@ -23,7 +23,7 @@ async def tts_synthesize(
Convert text to speech audio. Convert text to speech audio.
- **text**: Text to synthesize - **text**: Text to synthesize
- **voice_id**: Voice ID (e.g., 'zh-CN-YunjianNeural', 'en-US-AriaNeural') - **voice_id**: Voice ID (e.g., '[Chinese] zh-CN Yunjian', '[English] en-US Aria')
Returns path to generated audio file and duration. Returns path to generated audio file and duration.
""" """

View File

@@ -8,13 +8,13 @@ from pydantic import BaseModel, Field
class TTSSynthesizeRequest(BaseModel): class TTSSynthesizeRequest(BaseModel):
"""TTS synthesis request""" """TTS synthesis request"""
text: str = Field(..., description="Text to synthesize") text: str = Field(..., description="Text to synthesize")
voice_id: str = Field("zh-CN-YunjianNeural", description="Voice ID") voice_id: str = Field("[Chinese] zh-CN Yunjian", description="Voice ID")
class Config: class Config:
json_schema_extra = { json_schema_extra = {
"example": { "example": {
"text": "Hello, welcome to ReelForge!", "text": "Hello, welcome to ReelForge!",
"voice_id": "zh-CN-YunjianNeural" "voice_id": "[Chinese] zh-CN Yunjian"
} }
} }

View File

@@ -23,7 +23,7 @@ class VideoGenerateRequest(BaseModel):
# === Basic Config === # === Basic Config ===
n_scenes: int = Field(5, ge=1, le=20, description="Number of scenes (generate mode only)") n_scenes: int = Field(5, ge=1, le=20, description="Number of scenes (generate mode only)")
voice_id: str = Field("zh-CN-YunjianNeural", description="TTS voice ID") voice_id: str = Field("[Chinese] zh-CN Yunjian", description="TTS voice ID")
# === LLM Parameters === # === LLM Parameters ===
min_narration_words: int = Field(5, ge=1, le=100, description="Min narration words") min_narration_words: int = Field(5, ge=1, le=100, description="Min narration words")
@@ -57,7 +57,7 @@ class VideoGenerateRequest(BaseModel):
"text": "Atomic Habits teaches us that small changes compound over time to produce remarkable results.", "text": "Atomic Habits teaches us that small changes compound over time to produce remarkable results.",
"mode": "generate", "mode": "generate",
"n_scenes": 5, "n_scenes": 5,
"voice_id": "zh-CN-YunjianNeural", "voice_id": "[Chinese] zh-CN Yunjian",
"title": "The Power of Atomic Habits" "title": "The Power of Atomic Habits"
} }
} }

View File

@@ -25,7 +25,7 @@ comfyui:
# TTS-specific configuration # TTS-specific configuration
tts: tts:
default: selfhost/tts_edge.json # TTS workflow to use default_workflow: selfhost/tts_edge.json # TTS workflow to use
# Image-specific configuration # Image-specific configuration
image: image:

View File

@@ -15,16 +15,12 @@ class LLMConfig(BaseModel):
class TTSSubConfig(BaseModel): class TTSSubConfig(BaseModel):
"""TTS-specific configuration (under comfyui.tts)""" """TTS-specific configuration (under comfyui.tts)"""
model_config = {"populate_by_name": True} # Allow both field name and alias default_workflow: str = Field(default=None, description="Default TTS workflow (required, no fallback)")
default_workflow: str = Field(default=None, description="Default TTS workflow (required, no fallback)", alias="default")
class ImageSubConfig(BaseModel): class ImageSubConfig(BaseModel):
"""Image-specific configuration (under comfyui.image)""" """Image-specific configuration (under comfyui.image)"""
model_config = {"populate_by_name": True} # Allow both field name and alias default_workflow: str = Field(default=None, description="Default image workflow (required, no fallback)")
default_workflow: str = Field(default=None, description="Default image workflow (required, no fallback)", alias="default")
prompt_prefix: str = Field( prompt_prefix: str = Field(
default="Pure white background, minimalist illustration, matchstick figure style, black and white line drawing, simple clean lines", default="Pure white background, minimalist illustration, matchstick figure style, black and white line drawing, simple clean lines",
description="Prompt prefix for all image generation" description="Prompt prefix for all image generation"

View File

@@ -26,7 +26,9 @@ class StoryboardConfig:
video_fps: int = 30 # Frame rate video_fps: int = 30 # Frame rate
# Audio parameters # Audio parameters
voice_id: str = "zh-CN-YunjianNeural" # Default voice voice_id: str = "[Chinese] zh-CN Yunjian" # Default voice
tts_workflow: Optional[str] = None # TTS workflow filename (None = use default)
tts_speed: float = 1.2 # TTS speed multiplier (1.0 = normal, >1.0 = faster)
# Image parameters # Image parameters
image_width: int = 1024 image_width: int = 1024

View File

@@ -124,11 +124,12 @@ class FrameProcessor:
from reelforge.utils.os_util import get_task_frame_path from reelforge.utils.os_util import get_task_frame_path
output_path = get_task_frame_path(config.task_id, frame.index, "audio") output_path = get_task_frame_path(config.task_id, frame.index, "audio")
# Call TTS with specific output path # Call TTS with specific output path and workflow
audio_path = await self.core.tts( audio_path = await self.core.tts(
text=frame.narration, text=frame.narration,
workflow=config.tts_workflow, # Use workflow from config
voice=config.voice_id, voice=config.voice_id,
rate="+20%", speed=config.tts_speed, # Use speed (not rate) from config
output_path=output_path, output_path=output_path,
) )

View File

@@ -116,8 +116,8 @@ class ImagePromptGeneratorService:
# 5. Apply prompt prefix to each prompt # 5. Apply prompt prefix to each prompt
from reelforge.utils.prompt_helper import build_image_prompt from reelforge.utils.prompt_helper import build_image_prompt
# Get prompt prefix from config # Get prompt prefix from config (fix: correct path is comfyui.image.prompt_prefix)
image_config = self.core.config.get("image", {}) image_config = self.core.config.get("comfyui", {}).get("image", {})
prompt_prefix = image_config.get("prompt_prefix", "") prompt_prefix = image_config.get("prompt_prefix", "")
# Apply prefix to each base prompt # Apply prefix to each base prompt

View File

@@ -52,8 +52,8 @@ class TTSService(ComfyBaseService):
comfyui_url: Optional[str] = None, comfyui_url: Optional[str] = None,
runninghub_api_key: Optional[str] = None, runninghub_api_key: Optional[str] = None,
# TTS parameters # TTS parameters
voice: Optional[str] = None, voice: str = "[Chinese] zh-CN Yunjian",
speed: float = 1.0, speed: float = 1.2,
# Output path # Output path
output_path: Optional[str] = None, output_path: Optional[str] = None,
**params **params
@@ -88,7 +88,7 @@ class TTSService(ComfyBaseService):
audio_path = await reelforge.tts( audio_path = await reelforge.tts(
text="Hello", text="Hello",
workflow="tts_edge.json", workflow="tts_edge.json",
voice="zh-CN-XiaoxiaoNeural", voice="[Chinese] zh-CN Xiaoxiao",
speed=1.2 speed=1.2
) )

View File

@@ -54,7 +54,9 @@ class VideoGeneratorService:
# === Basic Config === # === Basic Config ===
n_scenes: int = 5, # Only used in generate mode; ignored in fixed mode n_scenes: int = 5, # Only used in generate mode; ignored in fixed mode
voice_id: str = "zh-CN-YunjianNeural", voice_id: str = "[Chinese] zh-CN Yunjian",
tts_workflow: Optional[str] = None,
tts_speed: float = 1.2,
output_path: Optional[str] = None, output_path: Optional[str] = None,
# === LLM Parameters === # === LLM Parameters ===
@@ -111,7 +113,9 @@ class VideoGeneratorService:
n_scenes: Number of storyboard scenes (default 5) n_scenes: Number of storyboard scenes (default 5)
Only effective in generate mode; ignored in fixed mode Only effective in generate mode; ignored in fixed mode
voice_id: TTS voice ID (default "zh-CN-YunjianNeural") voice_id: TTS voice ID (default "[Chinese] zh-CN Yunjian")
tts_workflow: TTS workflow filename (e.g., "tts_edge.json", None = use default)
tts_speed: TTS speed multiplier (1.0 = normal, 1.2 = 20% faster, default 1.2)
output_path: Output video path (auto-generated if None) output_path: Output video path (auto-generated if None)
min_narration_words: Min narration length (generate mode only) min_narration_words: Min narration length (generate mode only)
@@ -219,6 +223,8 @@ class VideoGeneratorService:
video_height=video_height, video_height=video_height,
video_fps=video_fps, video_fps=video_fps,
voice_id=voice_id, voice_id=voice_id,
tts_workflow=tts_workflow,
tts_speed=tts_speed,
image_width=image_width, image_width=image_width,
image_height=image_height, image_height=image_height,
image_workflow=image_workflow, image_workflow=image_workflow,
@@ -259,7 +265,8 @@ class VideoGeneratorService:
# Override prompt_prefix if provided (temporarily modify config) # Override prompt_prefix if provided (temporarily modify config)
original_prefix = None original_prefix = None
if prompt_prefix is not None: if prompt_prefix is not None:
image_config = self.core.config.get("image", {}) # Fix: image config is under comfyui.image, not directly under config
image_config = self.core.config.get("comfyui", {}).get("image", {})
original_prefix = image_config.get("prompt_prefix") original_prefix = image_config.get("prompt_prefix")
image_config["prompt_prefix"] = prompt_prefix image_config["prompt_prefix"] = prompt_prefix
logger.info(f"Using custom prompt_prefix: '{prompt_prefix}'") logger.info(f"Using custom prompt_prefix: '{prompt_prefix}'")

View File

@@ -31,7 +31,7 @@ _request_semaphore = asyncio.Semaphore(_MAX_CONCURRENT_REQUESTS)
async def edge_tts( async def edge_tts(
text: str, text: str,
voice: str = "zh-CN-YunjianNeural", voice: str = "[Chinese] zh-CN Yunjian",
rate: str = "+0%", rate: str = "+0%",
volume: str = "+0%", volume: str = "+0%",
pitch: str = "+0Hz", pitch: str = "+0Hz",
@@ -53,7 +53,7 @@ async def edge_tts(
Args: Args:
text: Text to convert to speech text: Text to convert to speech
voice: Voice ID (e.g., zh-CN-YunjianNeural, en-US-JennyNeural) voice: Voice ID (e.g., [Chinese] zh-CN Yunjian, [English] en-US Jenny)
rate: Speech rate (e.g., +0%, +50%, -20%) rate: Speech rate (e.g., +0%, +50%, -20%)
volume: Speech volume (e.g., +0%, +50%, -20%) volume: Speech volume (e.g., +0%, +50%, -20%)
pitch: Speech pitch (e.g., +0Hz, +10Hz, -5Hz) pitch: Speech pitch (e.g., +0Hz, +10Hz, -5Hz)
@@ -65,20 +65,20 @@ async def edge_tts(
Audio data as bytes (MP3 format) Audio data as bytes (MP3 format)
Popular Chinese voices: Popular Chinese voices:
- zh-CN-YunjianNeural (male, default) - [Chinese] zh-CN Yunjian (male, default)
- zh-CN-XiaoxiaoNeural (female) - [Chinese] zh-CN Xiaoxiao (female)
- zh-CN-YunxiNeural (male) - [Chinese] zh-CN Yunxi (male)
- zh-CN-XiaoyiNeural (female) - [Chinese] zh-CN Xiaoyi (female)
Popular English voices: Popular English voices:
- en-US-JennyNeural (female) - [English] en-US Jenny (female)
- en-US-GuyNeural (male) - [English] en-US Guy (male)
- en-GB-SoniaNeural (female, British) - [English] en-GB Sonia (female, British)
Example: Example:
audio_bytes = await edge_tts( audio_bytes = await edge_tts(
text="你好,世界!", text="你好,世界!",
voice="zh-CN-YunjianNeural", voice="[Chinese] zh-CN Yunjian",
rate="+20%" rate="+20%"
) )
""" """
@@ -235,11 +235,11 @@ async def list_voices(locale: str = None, retry_count: int = _RETRY_COUNT, retry
Example: Example:
# List all voices # List all voices
voices = await list_voices() voices = await list_voices()
# Returns: ['zh-CN-YunjianNeural', 'zh-CN-XiaoxiaoNeural', ...] # Returns: ['[Chinese] zh-CN Yunjian', '[Chinese] zh-CN Xiaoxiao', ...]
# List Chinese voices only # List Chinese voices only
voices = await list_voices(locale="zh-CN") voices = await list_voices(locale="zh-CN")
# Returns: ['zh-CN-YunjianNeural', 'zh-CN-XiaoxiaoNeural', ...] # Returns: ['[Chinese] zh-CN Yunjian', '[Chinese] zh-CN Xiaoxiao', ...]
""" """
logger.debug(f"Fetching Edge TTS voices, locale filter: {locale}, retry_count: {retry_count}") logger.debug(f"Fetching Edge TTS voices, locale filter: {locale}, retry_count: {retry_count}")

View File

@@ -459,7 +459,7 @@ def main():
else: else:
tts_workflow_key = "selfhost/tts_edge.json" # fallback tts_workflow_key = "selfhost/tts_edge.json" # fallback
# TTS preview expander (similar to image preview) # TTS preview expander (simplified, uses default voice and speed)
with st.expander(tr("tts.preview_title"), expanded=False): with st.expander(tr("tts.preview_title"), expanded=False):
# Preview text input # Preview text input
preview_text = st.text_input( preview_text = st.text_input(
@@ -473,7 +473,7 @@ def main():
if st.button(tr("tts.preview_button"), key="preview_tts", use_container_width=True): if st.button(tr("tts.preview_button"), key="preview_tts", use_container_width=True):
with st.spinner(tr("tts.previewing")): with st.spinner(tr("tts.previewing")):
try: try:
# Generate preview audio using selected workflow # Generate preview audio using selected workflow (use default voice and speed)
audio_path = run_async(reelforge.tts( audio_path = run_async(reelforge.tts(
text=preview_text, text=preview_text,
workflow=tts_workflow_key workflow=tts_workflow_key