更新TTS相关配置,调整语音ID格式,优化工作流参数,确保一致性和可读性。
This commit is contained in:
@@ -23,7 +23,7 @@ async def tts_synthesize(
|
|||||||
Convert text to speech audio.
|
Convert text to speech audio.
|
||||||
|
|
||||||
- **text**: Text to synthesize
|
- **text**: Text to synthesize
|
||||||
- **voice_id**: Voice ID (e.g., 'zh-CN-YunjianNeural', 'en-US-AriaNeural')
|
- **voice_id**: Voice ID (e.g., '[Chinese] zh-CN Yunjian', '[English] en-US Aria')
|
||||||
|
|
||||||
Returns path to generated audio file and duration.
|
Returns path to generated audio file and duration.
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -8,13 +8,13 @@ from pydantic import BaseModel, Field
|
|||||||
class TTSSynthesizeRequest(BaseModel):
|
class TTSSynthesizeRequest(BaseModel):
|
||||||
"""TTS synthesis request"""
|
"""TTS synthesis request"""
|
||||||
text: str = Field(..., description="Text to synthesize")
|
text: str = Field(..., description="Text to synthesize")
|
||||||
voice_id: str = Field("zh-CN-YunjianNeural", description="Voice ID")
|
voice_id: str = Field("[Chinese] zh-CN Yunjian", description="Voice ID")
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
json_schema_extra = {
|
json_schema_extra = {
|
||||||
"example": {
|
"example": {
|
||||||
"text": "Hello, welcome to ReelForge!",
|
"text": "Hello, welcome to ReelForge!",
|
||||||
"voice_id": "zh-CN-YunjianNeural"
|
"voice_id": "[Chinese] zh-CN Yunjian"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ class VideoGenerateRequest(BaseModel):
|
|||||||
|
|
||||||
# === Basic Config ===
|
# === Basic Config ===
|
||||||
n_scenes: int = Field(5, ge=1, le=20, description="Number of scenes (generate mode only)")
|
n_scenes: int = Field(5, ge=1, le=20, description="Number of scenes (generate mode only)")
|
||||||
voice_id: str = Field("zh-CN-YunjianNeural", description="TTS voice ID")
|
voice_id: str = Field("[Chinese] zh-CN Yunjian", description="TTS voice ID")
|
||||||
|
|
||||||
# === LLM Parameters ===
|
# === LLM Parameters ===
|
||||||
min_narration_words: int = Field(5, ge=1, le=100, description="Min narration words")
|
min_narration_words: int = Field(5, ge=1, le=100, description="Min narration words")
|
||||||
@@ -57,7 +57,7 @@ class VideoGenerateRequest(BaseModel):
|
|||||||
"text": "Atomic Habits teaches us that small changes compound over time to produce remarkable results.",
|
"text": "Atomic Habits teaches us that small changes compound over time to produce remarkable results.",
|
||||||
"mode": "generate",
|
"mode": "generate",
|
||||||
"n_scenes": 5,
|
"n_scenes": 5,
|
||||||
"voice_id": "zh-CN-YunjianNeural",
|
"voice_id": "[Chinese] zh-CN Yunjian",
|
||||||
"title": "The Power of Atomic Habits"
|
"title": "The Power of Atomic Habits"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ comfyui:
|
|||||||
|
|
||||||
# TTS-specific configuration
|
# TTS-specific configuration
|
||||||
tts:
|
tts:
|
||||||
default: selfhost/tts_edge.json # TTS workflow to use
|
default_workflow: selfhost/tts_edge.json # TTS workflow to use
|
||||||
|
|
||||||
# Image-specific configuration
|
# Image-specific configuration
|
||||||
image:
|
image:
|
||||||
|
|||||||
@@ -15,16 +15,12 @@ class LLMConfig(BaseModel):
|
|||||||
|
|
||||||
class TTSSubConfig(BaseModel):
|
class TTSSubConfig(BaseModel):
|
||||||
"""TTS-specific configuration (under comfyui.tts)"""
|
"""TTS-specific configuration (under comfyui.tts)"""
|
||||||
model_config = {"populate_by_name": True} # Allow both field name and alias
|
default_workflow: str = Field(default=None, description="Default TTS workflow (required, no fallback)")
|
||||||
|
|
||||||
default_workflow: str = Field(default=None, description="Default TTS workflow (required, no fallback)", alias="default")
|
|
||||||
|
|
||||||
|
|
||||||
class ImageSubConfig(BaseModel):
|
class ImageSubConfig(BaseModel):
|
||||||
"""Image-specific configuration (under comfyui.image)"""
|
"""Image-specific configuration (under comfyui.image)"""
|
||||||
model_config = {"populate_by_name": True} # Allow both field name and alias
|
default_workflow: str = Field(default=None, description="Default image workflow (required, no fallback)")
|
||||||
|
|
||||||
default_workflow: str = Field(default=None, description="Default image workflow (required, no fallback)", alias="default")
|
|
||||||
prompt_prefix: str = Field(
|
prompt_prefix: str = Field(
|
||||||
default="Pure white background, minimalist illustration, matchstick figure style, black and white line drawing, simple clean lines",
|
default="Pure white background, minimalist illustration, matchstick figure style, black and white line drawing, simple clean lines",
|
||||||
description="Prompt prefix for all image generation"
|
description="Prompt prefix for all image generation"
|
||||||
|
|||||||
@@ -26,7 +26,9 @@ class StoryboardConfig:
|
|||||||
video_fps: int = 30 # Frame rate
|
video_fps: int = 30 # Frame rate
|
||||||
|
|
||||||
# Audio parameters
|
# Audio parameters
|
||||||
voice_id: str = "zh-CN-YunjianNeural" # Default voice
|
voice_id: str = "[Chinese] zh-CN Yunjian" # Default voice
|
||||||
|
tts_workflow: Optional[str] = None # TTS workflow filename (None = use default)
|
||||||
|
tts_speed: float = 1.2 # TTS speed multiplier (1.0 = normal, >1.0 = faster)
|
||||||
|
|
||||||
# Image parameters
|
# Image parameters
|
||||||
image_width: int = 1024
|
image_width: int = 1024
|
||||||
|
|||||||
@@ -124,11 +124,12 @@ class FrameProcessor:
|
|||||||
from reelforge.utils.os_util import get_task_frame_path
|
from reelforge.utils.os_util import get_task_frame_path
|
||||||
output_path = get_task_frame_path(config.task_id, frame.index, "audio")
|
output_path = get_task_frame_path(config.task_id, frame.index, "audio")
|
||||||
|
|
||||||
# Call TTS with specific output path
|
# Call TTS with specific output path and workflow
|
||||||
audio_path = await self.core.tts(
|
audio_path = await self.core.tts(
|
||||||
text=frame.narration,
|
text=frame.narration,
|
||||||
|
workflow=config.tts_workflow, # Use workflow from config
|
||||||
voice=config.voice_id,
|
voice=config.voice_id,
|
||||||
rate="+20%",
|
speed=config.tts_speed, # Use speed (not rate) from config
|
||||||
output_path=output_path,
|
output_path=output_path,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -116,8 +116,8 @@ class ImagePromptGeneratorService:
|
|||||||
# 5. Apply prompt prefix to each prompt
|
# 5. Apply prompt prefix to each prompt
|
||||||
from reelforge.utils.prompt_helper import build_image_prompt
|
from reelforge.utils.prompt_helper import build_image_prompt
|
||||||
|
|
||||||
# Get prompt prefix from config
|
# Get prompt prefix from config (fix: correct path is comfyui.image.prompt_prefix)
|
||||||
image_config = self.core.config.get("image", {})
|
image_config = self.core.config.get("comfyui", {}).get("image", {})
|
||||||
prompt_prefix = image_config.get("prompt_prefix", "")
|
prompt_prefix = image_config.get("prompt_prefix", "")
|
||||||
|
|
||||||
# Apply prefix to each base prompt
|
# Apply prefix to each base prompt
|
||||||
|
|||||||
@@ -52,8 +52,8 @@ class TTSService(ComfyBaseService):
|
|||||||
comfyui_url: Optional[str] = None,
|
comfyui_url: Optional[str] = None,
|
||||||
runninghub_api_key: Optional[str] = None,
|
runninghub_api_key: Optional[str] = None,
|
||||||
# TTS parameters
|
# TTS parameters
|
||||||
voice: Optional[str] = None,
|
voice: str = "[Chinese] zh-CN Yunjian",
|
||||||
speed: float = 1.0,
|
speed: float = 1.2,
|
||||||
# Output path
|
# Output path
|
||||||
output_path: Optional[str] = None,
|
output_path: Optional[str] = None,
|
||||||
**params
|
**params
|
||||||
@@ -88,7 +88,7 @@ class TTSService(ComfyBaseService):
|
|||||||
audio_path = await reelforge.tts(
|
audio_path = await reelforge.tts(
|
||||||
text="Hello",
|
text="Hello",
|
||||||
workflow="tts_edge.json",
|
workflow="tts_edge.json",
|
||||||
voice="zh-CN-XiaoxiaoNeural",
|
voice="[Chinese] zh-CN Xiaoxiao",
|
||||||
speed=1.2
|
speed=1.2
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -54,7 +54,9 @@ class VideoGeneratorService:
|
|||||||
|
|
||||||
# === Basic Config ===
|
# === Basic Config ===
|
||||||
n_scenes: int = 5, # Only used in generate mode; ignored in fixed mode
|
n_scenes: int = 5, # Only used in generate mode; ignored in fixed mode
|
||||||
voice_id: str = "zh-CN-YunjianNeural",
|
voice_id: str = "[Chinese] zh-CN Yunjian",
|
||||||
|
tts_workflow: Optional[str] = None,
|
||||||
|
tts_speed: float = 1.2,
|
||||||
output_path: Optional[str] = None,
|
output_path: Optional[str] = None,
|
||||||
|
|
||||||
# === LLM Parameters ===
|
# === LLM Parameters ===
|
||||||
@@ -111,7 +113,9 @@ class VideoGeneratorService:
|
|||||||
n_scenes: Number of storyboard scenes (default 5)
|
n_scenes: Number of storyboard scenes (default 5)
|
||||||
Only effective in generate mode; ignored in fixed mode
|
Only effective in generate mode; ignored in fixed mode
|
||||||
|
|
||||||
voice_id: TTS voice ID (default "zh-CN-YunjianNeural")
|
voice_id: TTS voice ID (default "[Chinese] zh-CN Yunjian")
|
||||||
|
tts_workflow: TTS workflow filename (e.g., "tts_edge.json", None = use default)
|
||||||
|
tts_speed: TTS speed multiplier (1.0 = normal, 1.2 = 20% faster, default 1.2)
|
||||||
output_path: Output video path (auto-generated if None)
|
output_path: Output video path (auto-generated if None)
|
||||||
|
|
||||||
min_narration_words: Min narration length (generate mode only)
|
min_narration_words: Min narration length (generate mode only)
|
||||||
@@ -219,6 +223,8 @@ class VideoGeneratorService:
|
|||||||
video_height=video_height,
|
video_height=video_height,
|
||||||
video_fps=video_fps,
|
video_fps=video_fps,
|
||||||
voice_id=voice_id,
|
voice_id=voice_id,
|
||||||
|
tts_workflow=tts_workflow,
|
||||||
|
tts_speed=tts_speed,
|
||||||
image_width=image_width,
|
image_width=image_width,
|
||||||
image_height=image_height,
|
image_height=image_height,
|
||||||
image_workflow=image_workflow,
|
image_workflow=image_workflow,
|
||||||
@@ -259,7 +265,8 @@ class VideoGeneratorService:
|
|||||||
# Override prompt_prefix if provided (temporarily modify config)
|
# Override prompt_prefix if provided (temporarily modify config)
|
||||||
original_prefix = None
|
original_prefix = None
|
||||||
if prompt_prefix is not None:
|
if prompt_prefix is not None:
|
||||||
image_config = self.core.config.get("image", {})
|
# Fix: image config is under comfyui.image, not directly under config
|
||||||
|
image_config = self.core.config.get("comfyui", {}).get("image", {})
|
||||||
original_prefix = image_config.get("prompt_prefix")
|
original_prefix = image_config.get("prompt_prefix")
|
||||||
image_config["prompt_prefix"] = prompt_prefix
|
image_config["prompt_prefix"] = prompt_prefix
|
||||||
logger.info(f"Using custom prompt_prefix: '{prompt_prefix}'")
|
logger.info(f"Using custom prompt_prefix: '{prompt_prefix}'")
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ _request_semaphore = asyncio.Semaphore(_MAX_CONCURRENT_REQUESTS)
|
|||||||
|
|
||||||
async def edge_tts(
|
async def edge_tts(
|
||||||
text: str,
|
text: str,
|
||||||
voice: str = "zh-CN-YunjianNeural",
|
voice: str = "[Chinese] zh-CN Yunjian",
|
||||||
rate: str = "+0%",
|
rate: str = "+0%",
|
||||||
volume: str = "+0%",
|
volume: str = "+0%",
|
||||||
pitch: str = "+0Hz",
|
pitch: str = "+0Hz",
|
||||||
@@ -53,7 +53,7 @@ async def edge_tts(
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
text: Text to convert to speech
|
text: Text to convert to speech
|
||||||
voice: Voice ID (e.g., zh-CN-YunjianNeural, en-US-JennyNeural)
|
voice: Voice ID (e.g., [Chinese] zh-CN Yunjian, [English] en-US Jenny)
|
||||||
rate: Speech rate (e.g., +0%, +50%, -20%)
|
rate: Speech rate (e.g., +0%, +50%, -20%)
|
||||||
volume: Speech volume (e.g., +0%, +50%, -20%)
|
volume: Speech volume (e.g., +0%, +50%, -20%)
|
||||||
pitch: Speech pitch (e.g., +0Hz, +10Hz, -5Hz)
|
pitch: Speech pitch (e.g., +0Hz, +10Hz, -5Hz)
|
||||||
@@ -65,20 +65,20 @@ async def edge_tts(
|
|||||||
Audio data as bytes (MP3 format)
|
Audio data as bytes (MP3 format)
|
||||||
|
|
||||||
Popular Chinese voices:
|
Popular Chinese voices:
|
||||||
- zh-CN-YunjianNeural (male, default)
|
- [Chinese] zh-CN Yunjian (male, default)
|
||||||
- zh-CN-XiaoxiaoNeural (female)
|
- [Chinese] zh-CN Xiaoxiao (female)
|
||||||
- zh-CN-YunxiNeural (male)
|
- [Chinese] zh-CN Yunxi (male)
|
||||||
- zh-CN-XiaoyiNeural (female)
|
- [Chinese] zh-CN Xiaoyi (female)
|
||||||
|
|
||||||
Popular English voices:
|
Popular English voices:
|
||||||
- en-US-JennyNeural (female)
|
- [English] en-US Jenny (female)
|
||||||
- en-US-GuyNeural (male)
|
- [English] en-US Guy (male)
|
||||||
- en-GB-SoniaNeural (female, British)
|
- [English] en-GB Sonia (female, British)
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
audio_bytes = await edge_tts(
|
audio_bytes = await edge_tts(
|
||||||
text="你好,世界!",
|
text="你好,世界!",
|
||||||
voice="zh-CN-YunjianNeural",
|
voice="[Chinese] zh-CN Yunjian",
|
||||||
rate="+20%"
|
rate="+20%"
|
||||||
)
|
)
|
||||||
"""
|
"""
|
||||||
@@ -235,11 +235,11 @@ async def list_voices(locale: str = None, retry_count: int = _RETRY_COUNT, retry
|
|||||||
Example:
|
Example:
|
||||||
# List all voices
|
# List all voices
|
||||||
voices = await list_voices()
|
voices = await list_voices()
|
||||||
# Returns: ['zh-CN-YunjianNeural', 'zh-CN-XiaoxiaoNeural', ...]
|
# Returns: ['[Chinese] zh-CN Yunjian', '[Chinese] zh-CN Xiaoxiao', ...]
|
||||||
|
|
||||||
# List Chinese voices only
|
# List Chinese voices only
|
||||||
voices = await list_voices(locale="zh-CN")
|
voices = await list_voices(locale="zh-CN")
|
||||||
# Returns: ['zh-CN-YunjianNeural', 'zh-CN-XiaoxiaoNeural', ...]
|
# Returns: ['[Chinese] zh-CN Yunjian', '[Chinese] zh-CN Xiaoxiao', ...]
|
||||||
"""
|
"""
|
||||||
logger.debug(f"Fetching Edge TTS voices, locale filter: {locale}, retry_count: {retry_count}")
|
logger.debug(f"Fetching Edge TTS voices, locale filter: {locale}, retry_count: {retry_count}")
|
||||||
|
|
||||||
|
|||||||
@@ -459,7 +459,7 @@ def main():
|
|||||||
else:
|
else:
|
||||||
tts_workflow_key = "selfhost/tts_edge.json" # fallback
|
tts_workflow_key = "selfhost/tts_edge.json" # fallback
|
||||||
|
|
||||||
# TTS preview expander (similar to image preview)
|
# TTS preview expander (simplified, uses default voice and speed)
|
||||||
with st.expander(tr("tts.preview_title"), expanded=False):
|
with st.expander(tr("tts.preview_title"), expanded=False):
|
||||||
# Preview text input
|
# Preview text input
|
||||||
preview_text = st.text_input(
|
preview_text = st.text_input(
|
||||||
@@ -473,7 +473,7 @@ def main():
|
|||||||
if st.button(tr("tts.preview_button"), key="preview_tts", use_container_width=True):
|
if st.button(tr("tts.preview_button"), key="preview_tts", use_container_width=True):
|
||||||
with st.spinner(tr("tts.previewing")):
|
with st.spinner(tr("tts.previewing")):
|
||||||
try:
|
try:
|
||||||
# Generate preview audio using selected workflow
|
# Generate preview audio using selected workflow (use default voice and speed)
|
||||||
audio_path = run_async(reelforge.tts(
|
audio_path = run_async(reelforge.tts(
|
||||||
text=preview_text,
|
text=preview_text,
|
||||||
workflow=tts_workflow_key
|
workflow=tts_workflow_key
|
||||||
|
|||||||
Reference in New Issue
Block a user